USRP_Server 2.0
Flexible, GPU-accelerated radio-frequency readout software.
kernels.cu
#include "kernels.cuh"

__global__ void DIRECT_decimator(
    uint single_tone_length,
    size_t total_length,
    float2* __restrict input,
    float2* __restrict output
);

//Direct demodulation kernel. This kernel takes the raw input from the SDR and separates the channels. Note: it does not do any filtering.
__global__ void direct_demodulator_fp64(
    double* __restrict tone_frequencies,
    size_t index_counter,
    uint single_tone_length,
    size_t total_length,
    float2* __restrict input,
    float2* __restrict output
){
    double _i,_q;
    double tone_calculated_phase;
    uint input_index;

    for(uint i = blockIdx.x * blockDim.x + threadIdx.x;
        i < total_length;
        i += gridDim.x*blockDim.x
    ){

        input_index = i % single_tone_length;

        //here's the core: reads through this pointer should be automatically cached.
        tone_calculated_phase = 2. * tone_frequencies[i/single_tone_length] * (index_counter + input_index);

        //generate sine and cosine
        sincospi(tone_calculated_phase,&_q,&_i);

        //demodulate
        output[i].x = input[input_index].x * _i + input[input_index].y * _q;
        output[i].y = input[input_index].y * _i - input[input_index].x * _q;

    }

}

//Integer version of the direct demodulation kernel (numerically more stable). This kernel takes the raw input from the SDR and separates the channels. Note: it does not do any filtering.
__global__ void direct_demodulator_integer(
    int* __restrict tone_frequencies,
    int* __restrict tone_phases,
    int wavetablelen,
    size_t index_counter,
    size_t single_tone_length,
    size_t total_length,
    float2* __restrict input,
    float2* __restrict output
){
    double _i,_q;
    double tone_calculated_phase;
    size_t input_index;

    for(size_t i = blockIdx.x * blockDim.x + threadIdx.x;
        i < total_length;
        i += gridDim.x*blockDim.x
    ){
        input_index = i % single_tone_length;
        size_t ch = i / single_tone_length;
        long long int tf = tone_frequencies[ch];
        long long int ii = (input_index + index_counter)%wavetablelen;
        long long int tp = tone_phases[ch];
        long long int my_phase = (tp + (tf * ii)%wavetablelen);
        tone_calculated_phase = 2. * (my_phase / (double)wavetablelen);

        //generate sine and cosine
        sincospi(tone_calculated_phase,&_q,&_i);

        //demodulate (the strided access is due to the way python interprets packets)
        //output[input_index*2+ch].y = input_index;//input[input_index].y * _i - input[input_index].x * _q;
        //output[input_index*2+ch].x = tf;//input[input_index].x * _i + input[input_index].y * _q;

        //multichannel diagnostic
        //output[i].y = input_index;
        //output[i].x = tf;

        output[i].y = input[input_index].y * _i - input[input_index].x * _q;
        output[i].x = input[input_index].x * _i + input[input_index].y * _q;

    }
}
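
//Illustrative sketch (not part of the original file): a host-side reference of the
//integer phase bookkeeping used above, assuming tone frequencies and phases are
//expressed as integer counts on a wavetable of length wavetablelen. Keeping the
//product tf*ii in 64-bit integers and reducing modulo wavetablelen before the final
//division keeps the phase exact over arbitrarily long runs, whereas accumulating
//the phase in floating point (as the fp64 kernel does) slowly loses precision.
//The helper name host_tone_phase is hypothetical.
static inline double host_tone_phase(long long tf, long long tp, long long sample, long long wavetablelen){
    long long ii = sample % wavetablelen;                  //position on the wavetable
    long long my_phase = tp + (tf * ii) % wavetablelen;    //exact integer phase
    return 2. * ((double)my_phase / (double)wavetablelen); //argument for sincospi()
}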

//Wrapper for the direct demodulation kernel.
void direct_demodulator_wrapper(
    int* __restrict tone_frequencies,
    int* __restrict tone_phases,
    int wavetablelen,
    size_t index_counter,
    size_t single_tone_length,
    size_t total_length,
    float2* __restrict input,
    float2* __restrict output,
    cudaStream_t internal_stream){

    direct_demodulator_integer<<<1024,32,0,internal_stream>>>(tone_frequencies,tone_phases,wavetablelen,index_counter,single_tone_length,total_length,input,output);
}

//allocates memory on the gpu and fills it with a real hamming window. returns a pointer to the window on the device.
//note that this is a host function that wraps some device calls
//TODO: the window function should be made in a class, not in a function. Once we have the class we can put the class template directly in the header file to avoid undefined references to specialized templates during linking.
template <typename T>
T* make_hamming_window(int length, int side, bool diagnostic, bool host_ret){

    //(host_ret is ignored here; the float2 specialization below honors it)
    T *d_win,*h_win = (T*)malloc(length*sizeof(T));

    //allocate some memory on the GPU
    cudaMalloc((void **)&d_win, length*sizeof(T));

    //initialize the accumulator used for normalization
    float scale = 0;

    for(int i = 0; i < side; i++){
        h_win[i].y = 0;
        h_win[i].x = 0;
    }
    for(int i = length - side; i < length; i++){
        h_win[i].y = 0;
        h_win[i].x = 0;
    }
    for(int i = 0; i < length - side; i++){
        h_win[i+side].y = 0;

        //make hamming
        h_win[i+side].x = (0.54-0.46*cos(2.f*pi_f*i/((length-side)-1)));
        scale += h_win[i+side].x;

    }
    //normalize the window
    for(int i = 0; i < length; i++) h_win[i].x /= scale;

    //upload the window to the GPU
    cudaMemcpy(d_win, h_win, length*sizeof(T),cudaMemcpyHostToDevice);

    if(diagnostic){
        //write a diagnostic binary file containing the window.
        //TODO there should be a single hdf5 file containing all the diagnostic information
        FILE* window_diagnostic = fopen("USRP_hamming_filter_window.dat", "wb");
        fwrite(static_cast<void*>(h_win), sizeof(T), length, window_diagnostic);
        fclose(window_diagnostic);
    }

    //cleanup
    free(h_win);
    cudaDeviceSynchronize();
    return d_win;
}
//specializing the template to avoid errors during the linking process
template <>
float2* make_hamming_window<float2>(int length, int side, bool diagnostic, bool host_ret){

    float2 *ret,*d_win,*h_win = (float2*)malloc(length*sizeof(float2));

    //initialize the accumulator used for normalization
    float scale = 0;

    for(int i = 0; i < side; i++){
        h_win[i].y = 0;
        h_win[i].x = 0;
    }
    for(int i = length - side; i < length; i++){
        h_win[i].y = 0;
        h_win[i].x = 0;
    }
    for(int i = 0; i < length - side; i++){
        h_win[i+side].y = 0;

        //make hamming
        h_win[i+side].x = (0.54-0.46*cos(2.f*pi_f*i/((length-side)-1)));
        scale += h_win[i+side].x;

    }

    //normalize the window
    for(int i = 0; i < length; i++) h_win[i].x /= scale;

    if(diagnostic){
        //write a diagnostic binary file containing the window.
        //TODO there should be a single hdf5 file containing all the diagnostic information
        FILE* window_diagnostic = fopen("USRP_hamming_filter_window.dat", "wb");
        fwrite(static_cast<void*>(h_win), sizeof(float2), length, window_diagnostic);
        fclose(window_diagnostic);
    }
    if (not host_ret){

        //allocate some memory on the GPU
        cudaMalloc((void **)&d_win, length*sizeof(float2));

        //upload the window to the GPU
        cudaMemcpy(d_win, h_win, length*sizeof(float2),cudaMemcpyHostToDevice);

        //cleanup
        free(h_win);
        cudaDeviceSynchronize();
        ret = d_win;
    }else{
        ret = h_win;
    }

    return ret;
}
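
//Illustrative usage sketch (not part of the original file), assuming a window of
//2^14 taps with 128 zeroed samples at the edges; these values are arbitrary examples.
//The buffer lives on the device unless host_ret is true, so it must be released with
//the matching free function.
static void example_hamming_usage(){
    int length = 1 << 14;
    int side = 128;
    //device-resident window, no diagnostic file
    float2* d_window = make_hamming_window<float2>(length, side, false, false);
    //... hand d_window to a filter before releasing it ...
    cudaFree(d_window);

    //host-resident copy of the same window (host_ret = true)
    float2* h_window = make_hamming_window<float2>(length, side, false, true);
    free(h_window);
}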

//Creates a flat window in GPU memory and returns a device pointer.
float2* make_flat_window(int length, int side, bool diagnostic){

    float2 *d_win,*h_win = (float2*)malloc(length*sizeof(float2));

    //allocate some memory on the GPU
    cudaMalloc((void **)&d_win, length*sizeof(float2));

    //initialize the accumulator used for normalization
    float scale = 0;

    for(int i = 0; i < side; i++){
        h_win[i].y = 0;
        h_win[i].x = 0;
    }
    for(int i = length - side; i < length; i++){
        h_win[i].y = 0;
        h_win[i].x = 0;
    }
    for(int i = 0; i < length - side; i++){
        //make flat
        h_win[i+side].x = 1.;
        h_win[i+side].y = 0.;
        scale += h_win[i+side].x;

    }

    //normalize the window
    for(int i = 0; i < length; i++) h_win[i].x /= scale;

    //upload the window to the GPU
    cudaMemcpy(d_win, h_win, length*sizeof(float2),cudaMemcpyHostToDevice);

    if(diagnostic){
        //write a diagnostic binary file containing the window.
        //TODO there should be a single hdf5 file containing all the diagnostic information
        FILE* window_diagnostic = fopen("USRP_hamming_filter_window.dat", "wb");
        fwrite(static_cast<void*>(h_win), sizeof(float2), length, window_diagnostic);
        fclose(window_diagnostic);
    }

    //cleanup
    free(h_win);
    cudaDeviceSynchronize();
    return d_win;
}

//allocates memory on the gpu and fills it with a real sinc window. returns a pointer to the window on the device.
//note that this is a host function that wraps some device calls
float2* make_sinc_window(int length, float fc, bool diagnostic = false, bool host_ret = false){

    float2 *ret,*d_win,*h_win = (float2*)malloc(length*sizeof(float2));

    int sinc_index;

    //initialize the accumulator used for normalization
    float scale = 0;
    for(int i = 0; i < length; i++){

        sinc_index = i - (length-1)/2;

        //no imaginary part, the window is considered purely real
        h_win[i].y = 0;

        //generate sinc
        sinc_index != 0 ?
            h_win[i].x = (2.f*fc)*sin(2.f*pi_f*fc*sinc_index)/(2.f*pi_f*fc*sinc_index):
            h_win[i].x = (2.f*fc);

        //apply hamming
        h_win[i].x *= (0.54-0.46*cos(2.f*pi_f*i/(length-1)));

        scale += h_win[i].x;

    }

    //normalize the window
    for(int i = 0; i < length; i++) h_win[i].x /= scale;

    if(diagnostic){
        //write a diagnostic binary file containing the window.
        //TODO there should be a single hdf5 file containing all the diagnostic information
        FILE* window_diagnostic = fopen("USRP_polyphase_filter_window.dat", "wb");
        fwrite(static_cast<void*>(h_win), sizeof(float2), length, window_diagnostic);
        fclose(window_diagnostic);
    }
    if(not host_ret){
        //allocate some memory on the GPU
        cudaMalloc((void **)&d_win, length*sizeof(float2));
        //upload the window to the GPU
        cudaMemcpy(d_win, h_win, length*sizeof(float2),cudaMemcpyHostToDevice);
        //cleanup
        free(h_win);
        cudaDeviceSynchronize();
        ret = d_win;
    }else{
        ret = h_win;
    }
    return ret;
}
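
//Illustrative usage sketch (not part of the original file): building windowed-sinc
//taps for the polyphase filter below, assuming 4096 fft bins and 4 averaged slices
//per output sample (one tap per bin/slice pair, matching the win_index layout used
//in polyphase_filter). The cutoff fc is normalized (cycles per sample); all values
//here are example assumptions.
static void example_sinc_usage(){
    int nfft = 4096;
    int average = 4;
    //device-resident taps, no diagnostic file
    float2* d_taps = make_sinc_window(nfft * average, 1.f/nfft, false, false);
    //... store d_taps in a filter descriptor before calling the polyphase filter ...
    cudaFree(d_taps);
}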

__global__ void init_states(curandState *state, int twice_vector_len){

    for(int i = blockIdx.x * blockDim.x + threadIdx.x;
        i < twice_vector_len;
        i += gridDim.x*blockDim.x
    ){

        curand_init(1337,i,0,&state[i]);
    }
}

__global__ void make_rand(curandState *state, float2 *vector, int len, float scale = 1){

    for(int i = blockIdx.x * blockDim.x + threadIdx.x;
        i < len;
        i += gridDim.x*blockDim.x
    ){
        //map curand_uniform's (0,1] range to [-1,1) and apply the amplitude scale
        vector[i].x = scale * (2*curand_uniform(&state[2*i])-1);
        vector[i].y = scale * (2*curand_uniform(&state[2*i+1])-1);
    }
}
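
//Illustrative usage sketch (not part of the original file): make_rand consumes two
//curand states per complex sample (one for .x, one for .y), so init_states must be
//launched over twice the vector length. Launch shapes and sizes are assumptions.
static void example_noise_usage(int len){
    curandState* d_states;
    float2* d_noise;
    cudaMalloc((void**)&d_states, 2*len*sizeof(curandState));
    cudaMalloc((void**)&d_noise, len*sizeof(float2));
    init_states<<<1024,32>>>(d_states, 2*len);       //seed one state per component
    make_rand<<<1024,32>>>(d_states, d_noise, len, 1.f);
    cudaDeviceSynchronize();
    cudaFree(d_noise);
    cudaFree(d_states);
}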


void print_chirp_params(std::string comment, chirp_parameter cp){
    //stub: does nothing in this version.
}

__device__ float modulus(float number, float modulus){
    return number - __float2int_rd(number/modulus)*modulus;
}

__device__ unsigned int round_index(unsigned int last_index, unsigned int offset, unsigned int num_f, unsigned int f_len){

    unsigned long int pos = last_index + offset;

    unsigned long int chirp_len = f_len * num_f;

    return (pos - ((pos/chirp_len) * chirp_len)); //equivalent to pos%chirp_len

}


//generate a chirp waveform in a gpu buffer
__global__ void chirp_gen(

    float2* __restrict__ output, //pointer to the gpu buffer
    unsigned int output_size, //size of the buffer
    chirp_parameter* __restrict__ info, //chirp information
    unsigned long int last_index,
    float scale = 1 //scale the amplitude of the chirp

    ){
    unsigned long int effective_index; //index relative to the current position in signal generation (a single chirp may be generated in more than one kernel call).
    unsigned long int frequency_index; //actual frequency step in the chirp generation.
    unsigned long int phase_correction; //phase correction that allows parallel, coherent phase generation.
    int index; //index to use in sine and cosine.
    for(unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < output_size;
        offset += gridDim.x*blockDim.x
    ){

        //take previous calls into account and bookkeep the phase
        effective_index = (last_index + offset) % (info->num_steps * info->length);
        //effective_index = round_index(info->last_index,offset,info->num_steps,info->length);

        //calculate the current frequency to generate
        frequency_index = effective_index/info->length;

        unsigned long int q_phase = (frequency_index/2)*(frequency_index +1) + (frequency_index % 2)*((frequency_index +1)/2);

        //correct the phase. needed for parallel chirp generation.
        phase_correction = ( info->chirpness * (info->length * q_phase) );

        //evaluate the sine index
        index = (effective_index * (info->f0 + frequency_index * info->chirpness ) - phase_correction);

        output[offset].x = sinpi(((double)(index)/2147483647.5))*scale;
        output[offset].y = -cospi(((double)(index)/2147483647.5))*scale;

    }

}

//Wrapper for the chirp generation kernel.
void chirp_gen_wrapper(
    float2* __restrict__ output, //pointer to the gpu buffer
    unsigned int output_size, //size of the buffer
    chirp_parameter* __restrict__ info, //chirp information
    unsigned long int last_index,
    cudaStream_t internal_stream,
    float scale = 1 //scale the amplitude of the chirp

    ){

    chirp_gen<<<1024,32,0,internal_stream>>>(output,output_size,info,last_index,scale);

}
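
//Illustrative note (not part of the original file): the q_phase term in chirp_gen and
//chirp_demodulator is the triangular number f*(f+1)/2, computed so that the division
//by 2 always lands on the even factor and everything stays in exact integer
//arithmetic. A host-side statement of the identity, for reference:
static inline unsigned long int q_phase_reference(unsigned long int f){
    //(f/2)*(f+1) when f is even, f*((f+1)/2) when f is odd: both equal f*(f+1)/2
    return (f/2)*(f+1) + (f%2)*((f+1)/2);
}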


__global__ void chirp_demodulator(
    float2* __restrict__ input, //pointer to the input buffer
    float2* __restrict__ output, //pointer to the gpu buffer
    unsigned int output_size, //size of the buffers
    unsigned long int last_index,
    chirp_parameter* __restrict__ info //chirp information
    ){
    unsigned long int effective_index; //index relative to the current position in signal generation (a single chirp may be generated in more than one kernel call).
    unsigned long int frequency_index; //actual frequency step in the chirp generation.
    unsigned long int phase_correction; //phase correction that allows parallel, coherent phase generation.
    int index; //index to use in sine and cosine.
    float2 chirp; //the locally recalculated chirp sample
    for(unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < output_size;
        offset += gridDim.x*blockDim.x
    ){

        //take previous calls into account and bookkeep the phase
        effective_index = (last_index + offset) % (info->num_steps * info->length);
        //effective_index = round_index(info->last_index,offset,info->num_steps,info->length);

        //calculate the current frequency to generate
        frequency_index = effective_index/info->length;

        unsigned long int q_phase = (frequency_index/2)*(frequency_index +1) + (frequency_index % 2)*((frequency_index +1)/2);

        //correct the phase. needed for parallel chirp generation.
        phase_correction = ( info->chirpness * (info->length * q_phase) );

        //evaluate the sine index
        index = (effective_index * (info->f0 + frequency_index * info->chirpness ) - phase_correction) ;

        chirp.x = sinpi(((double)(index)/2147483647.5));
        chirp.y = -cospi(((double)(index)/2147483647.5));

        output[offset].x = chirp.x*input[offset].x + chirp.y*input[offset].y;
        output[offset].y = chirp.x*input[offset].y - chirp.y*input[offset].x;
    }
}
__device__ float absolute(float2 number){return sqrt(number.x*number.x+number.y*number.y);}

//Wrapper for the chirp demodulation kernel.
void chirp_demodulator_wrapper(
    float2* __restrict__ input, //pointer to the input buffer
    float2* __restrict__ output, //pointer to the gpu buffer
    unsigned int output_size, //size of the buffers
    unsigned long int last_index,
    chirp_parameter* __restrict__ info, //chirp information
    cudaStream_t internal_stream
    ){

    chirp_demodulator<<<1024,32,0,internal_stream>>>(input,output,output_size,last_index,info);

}


__global__ void move_buffer(
    float2* __restrict__ from,
    float2* __restrict__ to,
    int size,
    int from_offset,
    int to_offset
    ){

    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < size;
        offset += gridDim.x*blockDim.x
    ){
        to[offset+to_offset] = from[offset+from_offset];
        //if(absolute(from[offset])==0)printf("0 found in %d\n",offset);
    }
}
void move_buffer_wrapper(
    float2* __restrict__ from,
    float2* __restrict__ to,
    int size,
    int from_offset,
    int to_offset,
    cudaStream_t internal_stream
    ){
    move_buffer<<<1024,64,0,internal_stream>>>(from,to,size,from_offset,to_offset);

}


//kernel used to apply the polyphase filter to a buffer using a window
__global__ void polyphase_filter(
    float2* __restrict__ input,
    float2* __restrict__ output,
    filter_param* __restrict__ filter_info
    ){

    //loop over the entire device buffer (the last samples may be meaningless, but this is accounted for in the host loop)
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < filter_info->batching * filter_info->n_tones;
        offset += gridDim.x*blockDim.x
    ){

        //check if there are enough samples for averaging
        if(offset + filter_info->n_tones*(filter_info->average_buffer) < filter_info->batching * filter_info->n_tones){

            //accumulate the average in an initialized register
            float2 acc;
            acc.x=0;
            acc.y=0;

            //loop over the samples to average and add them to the accumulator. NOTE: the register acc is private to the thread.
            for(int i = 0; i<(filter_info->average_buffer); i++ ){

                //calculate the index of the sample to average
                int sample_index = offset + i* (filter_info->n_tones);

                //calculate the corresponding window sample
                int win_index = offset%filter_info->n_tones+i*filter_info->n_tones;

                //apply the window and accumulate. NOTE: the window is considered purely REAL
                acc.x += input[sample_index].x * (filter_info->window)[win_index].x;
                acc.y += input[sample_index].y * (filter_info->window)[win_index].x;
            }

            //no final averaging step, because the window is normalized
            //acc.x = acc.x/filter_info->average_buffer;
            //acc.y = acc.y/filter_info->average_buffer;

            //finally write the filtered sample to the output buffer
            output[offset] = acc;
        }
    }
}

//Wrapper for the polyphase filter kernel.
void polyphase_filter_wrapper(
    float2* __restrict__ input,
    float2* __restrict__ output,
    filter_param* __restrict__ filter_info,
    cudaStream_t internal_stream
    ){

    polyphase_filter<<<896*2,64,0,internal_stream>>>(input,output,filter_info);
}
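
//Illustrative sketch (not part of the original file): what one output sample of the
//polyphase_filter kernel computes, written as a plain serial loop. Assumes the same
//filter_param layout used above with a purely real window; in a real run these
//buffers live on the device, so this only documents the arithmetic.
static float2 polyphase_reference_sample(const float2* input, const filter_param* fi, int offset){
    float2 acc; acc.x = 0; acc.y = 0;
    for(int i = 0; i < fi->average_buffer; i++){
        int sample_index = offset + i * fi->n_tones;            //same tone, later slices
        int win_index = offset % fi->n_tones + i * fi->n_tones; //matching window tap
        acc.x += input[sample_index].x * fi->window[win_index].x;
        acc.y += input[sample_index].y * fi->window[win_index].x;
    }
    return acc;
}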



//select the tones from the fft result and reorder them in a new buffer
__global__ void tone_select(
    float2* __restrict__ input, //must be the fft output
    float2* __restrict__ output,//the buffer that will then be downloaded to the host
    filter_param* __restrict__ filter_info, //information about the filtering process
    int effective_batching //how many samples per tone have been effectively calculated
    ){

    //filter_info->eff_n_tones is the number of 'selected' tones
    //filter_info->n_tones is the number of fft bins
    //effective_batching counts how many ffts are present in the buffer
    //filter_info->tones holds the information about which tones are selected
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < effective_batching*filter_info->eff_n_tones;
        offset += gridDim.x*blockDim.x
    ){

        //calculate where to take the sample from
        int index = (offset/filter_info->eff_n_tones)*filter_info->n_tones + filter_info->tones[offset % filter_info->eff_n_tones];

        //write the sample to the output buffer
        output[offset] = input[index];

    }
}

//Wrapper for the tone selection kernel.
void tone_select_wrapper(
    float2* __restrict__ input, //must be the fft output
    float2* __restrict__ output,//the buffer that will then be downloaded to the host
    filter_param* __restrict__ filter_info, //information about the filtering process
    int effective_batching, //how many samples per tone have been effectively calculated
    cudaStream_t internal_stream
    ){

    tone_select<<<1024,64,0,internal_stream>>>(input,output,filter_info,effective_batching);

}


//scale a float2 buffer by a float scalar
__global__ void scale_buffer(
    float2* __restrict__ input,
    int input_size,
    float scale
    ){

    //just loop over the array and multiply both components
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < input_size;
        offset += gridDim.x*blockDim.x
    ){

        input[offset].x = input[offset].x * scale;
        input[offset].y = input[offset].y * scale;
    }
}

//generate a set of tones and return a host pointer to the buffer, unless the device option is true.
//NOTE: the length of the buffer is the sampling_rate
float2* tone_gen(
    tone_parameters* info, //tone information (all host side)
    int sampling_rate,
    float scale, //scale the whole buffer (all tones) by a scalar
    bool device //return a device buffer pointer instead
    ){

    //base for the fft. will be used as the buffer recipient later.
    float2* base_vector;
    base_vector = (float2*)malloc(sampling_rate*sizeof(float2));

    //set the cuda fft plan
    cufftHandle plan;
    if (cufftPlan1d(&plan, sampling_rate, CUFFT_C2C, 1) != CUFFT_SUCCESS){
        //print_error("Cannot allocate memory on the gpu for tone generation.");
        std::cout<<"CUDA ERROR IN cufftHandle"<<std::endl;
        return NULL;
    }

    //device base for the fft. NOTE: an in-place transformation applies.
    float2* device_base;
    cudaMalloc((void **)&device_base, sampling_rate*sizeof(float2));

    //accumulator for the normalization of the buffer
    float normalization = 0;

    //zero the host memory
    for(int i = 0; i < sampling_rate; i++){
        base_vector[i].x = 0;
        base_vector[i].y = 0;
    }

    //set the tones in the host base vector
    for(int i = 0; i < info->tones_number; i++){

        //rotate frequencies in case of a negative frequency offset
        int freq;
        info->tone_frquencies[i] > 0 ?
            freq = info->tone_frquencies[i]:
            freq = sampling_rate + info->tone_frquencies[i];

        //set the corresponding amplitude (NOTE: this only works if fft_length == sampling_rate)
        base_vector[freq].x = info->tones_amplitudes[i];// * std::cos(i * pi_f/info->tones_number);

        //all same phase for now (or distributed to avoid power spikes? NO?)
        base_vector[freq].y = 0;//info->tones_amplitudes[i] * std::sin(i * pi_f/info->tones_number);

        //add the tone amplitude to the normalization accumulator
        normalization += info->tones_amplitudes[i];
    }

    //finalize the normalization coefficient calculation
    //normalization = 1./normalization;

    //upload the host base vector to the device
    cudaMemcpy(device_base, base_vector, sampling_rate*sizeof(float2),cudaMemcpyHostToDevice);

    //execute the inverse FFT transform
    if (cufftExecC2C(plan, device_base, device_base, CUFFT_INVERSE) != CUFFT_SUCCESS){
        //print_error("Cannot execute fft transform for tone generation.");
        std::cout<<"CUDA ERROR: Cannot execute fft transform for tone generation."<<std::endl;
        return NULL;
    }

    //apply normalization to the device buffer
    //scale_buffer<<<1024, 32>>>(device_base, sampling_rate, normalization);

    //if the user set a scale, apply the scalar multiplication
    if(scale>1.) std::cout<<"CUDA WARNING: Maximum amplitude of the TX buffer is > 1."<<std::endl;//print_warning("Maximum amplitude of the TX buffer is > 1.");
    if(scale!=1.) scale_buffer<<<1024, 32>>>(device_base, sampling_rate, scale);

    //download the buffer from the gpu to the host
    cudaMemcpy(base_vector,device_base,sampling_rate*sizeof(float2),cudaMemcpyDeviceToHost);

    //clean up the GPU fft plan
    cufftDestroy(plan);

    //if this option is true, the function returns the device pointer instead of the host pointer
    if(device){

        //clean the host buffer
        free(base_vector);

        //return the device pointer
        return device_base;

    }else{

        //clean the GPU buffer
        cudaFree(device_base);

        //return the host pointer
        return base_vector;

    }
}
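
//Illustrative usage sketch (not part of the original file): generating a two-tone TX
//buffer one sampling_rate long. The tone_parameters field names follow kernels.cuh;
//the frequency and amplitude values are arbitrary example assumptions.
static void example_tone_gen_usage(){
    tone_parameters info;
    info.tones_number = 2;
    int freqs[2] = {100000, -250000};   //bin offsets; negative tones are wrapped around
    float amps[2] = {0.5f, 0.5f};
    info.tone_frquencies = freqs;       //(sic: field spelled as declared in kernels.cuh)
    info.tones_amplitudes = amps;
    //one sampling_rate worth of samples, host-resident (device = false)
    float2* h_buffer = tone_gen(&info, 1000000, 1.f, false);
    //... hand h_buffer to the transmitter ...
    free(h_buffer);
}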

//sum two device buffers into one output device buffer.
__global__ void mix_buffers(
    float2* __restrict__ buffer1,
    float2* __restrict__ buffer2,
    float2* __restrict__ output,
    int length
    ){
    //loop over the buffers.
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < length;
        offset += gridDim.x*blockDim.x
    ){
        output[offset].x = buffer1[offset].x + buffer2[offset].x;
        output[offset].y = buffer1[offset].y + buffer2[offset].y;
    }
}


__global__ void average_spectra(
    float2* __restrict__ input,
    float2* __restrict__ output,
    int decim,
    int nfft,
    int input_len

    ){

    //NOTE: the output must be zero-initialized before launch, since samples are accumulated with atomicAdd.
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < input_len;
        offset += gridDim.x*blockDim.x
    ){

        //accumulate decim consecutive fft slices into one output slice, bin by bin
        int output_offset = offset%nfft + nfft*int(offset/(nfft*decim));
        atomicAdd(&output[output_offset].x, input[offset].x);
        atomicAdd(&output[output_offset].y, input[offset].y);

    }

}

//Decimates the output of the polyphase filter bank by averaging decim fft slices together.
void decimate_spectra(
    float2* __restrict__ input, //output of the pfb
    float2* __restrict__ output,//decimated output
    int decim, //decimation factor (multiplicative to the pfb one)
    int nfft, //length of the fft
    int input_len, //could be calculated inside, but a dedicated class already does it
    int output_len,
    cudaStream_t stream_f //stream on which to launch the decimator
    ){

    //the input length must be chopped to the exact amount of data

    average_spectra<<<1024, 32, 0, stream_f>>>(
        input,
        output,
        decim,
        nfft,
        input_len
    );

    //launched on the same stream, so it runs after the accumulation completes
    scale_buffer<<<1024, 32, 0, stream_f>>>(output, output_len, 1./decim);

}
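
//Illustrative note (not part of the original file): with the averaging scheme above,
//decim*nfft input samples collapse into nfft output samples, so a consistent output
//length for a chopped input follows from (the helper name is hypothetical):
static inline int decimated_spectra_length(int input_len, int decim, int nfft){
    return nfft * (input_len / (nfft * decim)); //whole averaging groups only
}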

//decimate the output of the fft without tone selection
//NOTE: this kernel has to be launched from its wrapper or with Nblocks*Nthreads == out_len, and
//it is not protected from accessing data outside input_len (see wrapper)
__global__ void accumulate_ffts(
    float2* __restrict__ input,
    float2* __restrict__ output,
    int decim,
    int nfft,
    int output_length
    ){

    //declare and initialize some shared memory
    extern __shared__ float2 shared_buffer[];

    //accumulate samples in shared memory, iterating on the output
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < output_length;
        offset += gridDim.x*blockDim.x
    ){

        //there is one float2 per thread, needed as an accumulator
        shared_buffer[threadIdx.x].x = 0;
        shared_buffer[threadIdx.x].y = 0;

        int input_index;
        //iterate over the samples to accumulate: the same bin (offset % nfft) in decim consecutive ffts
        for(int j = 0; j < decim; j++){

            input_index = (offset / nfft) * (nfft * decim) + j * nfft + (offset % nfft);

            //accumulate samples
            shared_buffer[threadIdx.x].x += input[input_index].x;
            shared_buffer[threadIdx.x].y += input[input_index].y;
        }

        //copy the accumulated average back to the output
        output[offset].x = shared_buffer[threadIdx.x].x/decim;
        output[offset].y = shared_buffer[threadIdx.x].y/decim;
    }
}


//set the real part of a buffer to a value and zero the imaginary part.
__global__ void zero_mem(float2* __restrict__ input, int input_len, float value){
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < input_len;
        offset += gridDim.x*blockDim.x
    ){
        input[offset].x = value;
        input[offset].y = 0;
        //if(offset == 0) printf("Using zeromem!!\n");
    }

}

__device__ float magnitude(float2 sample){
    return sqrt(sample.x*sample.x+sample.y*sample.y);
}



//build_vna_profile()
#ifdef CUBLAS_API_H_
// cuBLAS API errors
void _cudaGetErrorEnum(cublasStatus_t error)
{
    switch (error)
    {

        case CUBLAS_STATUS_NOT_INITIALIZED:
            //print_error("CUBLAS_STATUS_NOT_INITIALIZED");
            std::cout<<"CUBLAS ERROR: CUBLAS_STATUS_NOT_INITIALIZED"<<std::endl;
            break;
        case CUBLAS_STATUS_ALLOC_FAILED:
            //print_error( "CUBLAS_STATUS_ALLOC_FAILED");
            std::cout<<"CUBLAS ERROR: CUBLAS_STATUS_ALLOC_FAILED"<<std::endl;
            break;
        case CUBLAS_STATUS_INVALID_VALUE:
            //print_error( "CUBLAS_STATUS_INVALID_VALUE");
            std::cout<<"CUBLAS ERROR: CUBLAS_STATUS_INVALID_VALUE"<<std::endl;
            break;
        case CUBLAS_STATUS_ARCH_MISMATCH:
            //print_error( "CUBLAS_STATUS_ARCH_MISMATCH");
            std::cout<<"CUBLAS ERROR: CUBLAS_STATUS_ARCH_MISMATCH"<<std::endl;
            break;
        case CUBLAS_STATUS_MAPPING_ERROR:
            //print_error( "CUBLAS_STATUS_MAPPING_ERROR");
            std::cout<<"CUBLAS ERROR: CUBLAS_STATUS_MAPPING_ERROR"<<std::endl;
            break;
        case CUBLAS_STATUS_EXECUTION_FAILED:
            //print_error( "CUBLAS_STATUS_EXECUTION_FAILED");
            std::cout<<"CUBLAS ERROR: CUBLAS_STATUS_EXECUTION_FAILED"<<std::endl;
            break;
        case CUBLAS_STATUS_INTERNAL_ERROR:
            //print_error( "CUBLAS_STATUS_INTERNAL_ERROR");
            std::cout<<"CUBLAS ERROR: CUBLAS_STATUS_INTERNAL_ERROR"<<std::endl;
            break;
        default:
            break;
    }
}
#endif

// Used in the VNA as a lock-in decimator.
void cublas_decim(
    float2* __restrict__ input,
    float2* __restrict__ output,
    float2* __restrict__ profile,
    cuComplex* __restrict__ zero,
    cuComplex* __restrict__ one,
    int ppt,
    int n_freqs,
    cublasHandle_t* __restrict__ handle
    ){

    cublasStatus_t err = cublasCgemv(*handle, CUBLAS_OP_T,
                           n_freqs, ppt,
                           one,
                           input, n_freqs,
                           profile, 1,
                           zero,
                           output, 1);
    _cudaGetErrorEnum(err);
}
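
//Illustrative note (not part of the original file): with CUBLAS_OP_T, column-major
//layout and leading dimension n_freqs, the cublasCgemv call above computes, for each
//of the ppt time slices,
//    output[t] = sum over f of input[f + t*n_freqs] * profile[f]
//i.e. a profile-weighted sum over each column of n_freqs samples: one lock-in output
//per slice, a decimation by n_freqs. A serial reference of the same arithmetic
//(host pointers, illustrative only; the helper name is hypothetical):
static void cublas_decim_reference(const float2* input, float2* output, const float2* profile, int ppt, int n_freqs){
    for(int t = 0; t < ppt; t++){
        float2 acc; acc.x = 0; acc.y = 0;
        for(int f = 0; f < n_freqs; f++){
            float2 a = input[f + t*n_freqs];
            float2 b = profile[f];
            acc.x += a.x*b.x - a.y*b.y; //complex multiply, real part (no conjugation: OP_T, not OP_C)
            acc.y += a.x*b.y + a.y*b.x; //complex multiply, imaginary part
        }
        output[t] = acc;
    }
}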


//wrapper for the fft decimation kernel above. decimates the pfb output.
//NOTE: this function does not take care of the remainder and assumes that the calculation
//to determine output_length has already been done externally.
void decimate_pfb(
    float2* __restrict__ input, //output of the pfb
    float2* __restrict__ output,//decimated output
    int decim, //decimation factor (multiplicative to the pfb one)
    int nfft, //length of the fft
    int output_length, //could be calculated inside, but a dedicated class already does it
    cudaStream_t stream_f //stream on which to launch the decimator
    ){

    //the number of blocks can vary as the number of valid batches changes
    int blocks = (output_length + PFB_DECIM_TPB - 1) / PFB_DECIM_TPB; //ceiling division
    accumulate_ffts<<<blocks, PFB_DECIM_TPB, PFB_DECIM_TPB*sizeof(float2), stream_f>>>(
        input,output,decim,nfft,output_length);
}


//double-precision version of cublas_decim.
void D_cublas_decim(
    double2* __restrict__ input,
    double2* __restrict__ output,
    double2* __restrict__ profile,
    cuDoubleComplex* __restrict__ zero,
    cuDoubleComplex* __restrict__ one,
    int ppt,
    int n_freqs,
    cublasHandle_t* __restrict__ handle
    ){

    cublasStatus_t err = cublasZgemv(*handle, CUBLAS_OP_T,
                           n_freqs, ppt,
                           one,
                           input, n_freqs,
                           profile, 1,
                           zero,
                           output, 1);
    _cudaGetErrorEnum(err);
}
__global__ void double2float(
    double2* __restrict__ input, //from double
    float2* __restrict__ output,//to float
    int length
    ){

    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < length;
        offset += gridDim.x*blockDim.x
    ){
        output[offset].x = input[offset].x;
        output[offset].y = input[offset].y;
    }
}

__global__ void float2double(
    float2* __restrict__ input, //from float
    double2* __restrict__ output,//to double
    int length
    ){

    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < length;
        offset += gridDim.x*blockDim.x
    ){
        output[offset].x = input[offset].x;
        output[offset].y = input[offset].y;
    }
}