USRP_Server  2.0
A flexible, GPU-accelerated radio-frequency readout software.
USRP_demodulator.cpp
Go to the documentation of this file.
1 #include "USRP_demodulator.hpp"
2 #include "fir.hpp"
3 #define checkcublas(X) assert( ( X ) == CUBLAS_STATUS_SUCCESS )
4 
5 //initialization: the parameters are coming directly from the client (from the async communication thread)
6 //diagnostic allows to print the window on a binary file and stores some diagnostic information //TODO on a hdf5 file??
7 RX_buffer_demodulator::RX_buffer_demodulator(param* init_parameters, bool init_diagnostic){
8 
9  //enable or disable information diagnostic
10  diagnostic = init_diagnostic;
11 
12  //store initialization parameters
13  parameters = init_parameters;
14 
15  //check what kind of demodulator has been requested
16  bool mixed_buffer_type = false;
17  int chirp_counter = 0;
18 
19  w_type last_w_type = NODSP;
20 
21  //wrapper around the parameters (it could be empty: see default behaviour)
22  try{last_w_type = parameters->wave_type.at(0);}catch(const std::out_of_range& e){
23  if(diagnostic)print_warning("No signal processing options found. Transmitting full buffer.");
24  last_w_type = NODSP;
25  }
26 
27  for(size_t i = 0; i < parameters->wave_type.size(); i++){
28  if(parameters->wave_type[i]!=last_w_type)mixed_buffer_type = true;
29  if(parameters->wave_type[i]==CHIRP)chirp_counter++;
30  }
31  if(chirp_counter>1){
32  print_error("Multiple chirp RX buffer demodulation has been requested. This feature is not implemented yet.");
33  exit(-1);
34  }
35  // In the future this can be removed and different tones generator can be used to compose a single signal.
36  if(mixed_buffer_type){
37  print_error("Mixed RX buffer demodulation has been requested. This feature is not implemented yet.");
38  exit(-1);
39  }
40 
41  //create an high priority stream (allow to overlap TX and RX operation on same GPU)
42  int low_p,high_p;
43  cudaDeviceGetStreamPriorityRange ( &low_p, &high_p );
44  cudaStreamCreateWithPriority(&internal_stream,cudaStreamNonBlocking, low_p);
45 
46  //needed only in case of buffer mismamtch
47  int in_out_len = 0;
48 
49  //if there is a valid decimation enable the decimator
50  decimator_active = (parameters->decim > 0)?true:false;
51  //if(decimator_active)std::cout<<"Decimator is active: "<<parameters->decim <<std::endl;
52  //print_debug("decimator is active?",decimator_active);
53  //initialize the memory pointer for eventual spare buffers
54  spare_size = 0;
55 
56  switch(last_w_type){
57  // Insert new DSP cases below!
58 
59  case DIRECT:
60  if(diagnostic)print_warning("Demodulator diagnostic enabled.");
61  // Initialization code!
62 
63  // Create the device vector of frequencies
64  cudaMalloc((void **)&DIRECT_tone_frquencies, parameters->wave_type.size()*sizeof(int));
65 
66  //No care for phase yet
67  cudaMalloc((void **)&DIRECT_tone_phases, parameters->wave_type.size()*sizeof(int));
68  cudaMemset(DIRECT_tone_phases, 0, parameters->wave_type.size()*sizeof(int));
69 
70  DIRECT_tones = (int*)malloc(parameters->wave_type.size()*sizeof(int));
71  for(uint k=0; k<parameters->wave_type.size(); k++ ){
72  DIRECT_tones[k] = parameters->freq[k];
73  }
74 
75  // Upload the frequency vector
76  cudaMemcpy(DIRECT_tone_frquencies, DIRECT_tones, parameters->freq.size() * sizeof(int),cudaMemcpyHostToDevice);
77 
78  // Allocate input memory
79  cudaMalloc((void **)&direct_input,parameters->buffer_len*sizeof(float2));
80 
81  //Optimization: calculate this once.
82  DIRECT_output_size = parameters->buffer_len * parameters->wave_type.size();
83 
84  // Allocate output memory
85  cudaMalloc((void **)&direct_output, DIRECT_output_size*sizeof(float2));
86 
87  //Initialize bookeeping
88  DIRECT_current_index = 0;
89 
90  //Attach the demodulator handle to the stream.
91  cublasCreate(&handle);
92  cublasSetStream(handle,internal_stream);
93 
94  DIRECT_FIR_output_size = DIRECT_output_size;
95 
96  if(parameters->decim > 0){
97 
98  //Initialize FIR taps
99  fir_taps = make_sinc_window(parameters->decim * parameters->pf_average, 0.75/(parameters->decim*2), false, true);
100  //for (int y = 0; )
101  //make_hamming_window<float2>(parameters->buffer_len, 0, false, true)
102 
103  DIRECT_FIR_output_size /= parameters->decim;
104  cudaMalloc((void **)&FIR_output, DIRECT_FIR_output_size*sizeof(float2));
105  //Initialize FIR
106  //NOTE: the FIR will work on one channel at time
107  DIRECT_FIR = (FIR**)malloc(sizeof(FIR*)*parameters->wave_type.size());
108 
109  //could make use of multiple streams
110  for(size_t k=0; k<parameters->wave_type.size(); k++)DIRECT_FIR[k] = new FIR(handle, internal_stream, fir_taps, parameters->decim, parameters->pf_average, parameters->buffer_len);
111  }
112 
113  cudaMalloc((void **)&transposed, DIRECT_FIR_output_size*sizeof(float2));
114 
115  //declare pointers to functions
116  process_ptr = &RX_buffer_demodulator::process_direct;
117  clr_ptr = &RX_buffer_demodulator::close_direct;
118 
119  break;
120 
121  case TONES:
122 
123  //warn the user about the diagnostic
124  if(diagnostic)print_warning("Demodulator diagnostic enabled.");
125 
126  //assign the correct cleaning and process function pointer
127  process_ptr = &RX_buffer_demodulator::process_pfb;
128  clr_ptr = &RX_buffer_demodulator::close_pfb;
129 
130  //the cut-off frequency is calculated in function of the number of tones
131  fcut = 1./(2*parameters->fft_tones);
132 
133  //calculate the window and stores it in gpu memory. NOTE: this is done on the default stream.
134  window = make_sinc_window(parameters->fft_tones*parameters->pf_average, fcut, diagnostic,false);
135 
136  //convert the parameter struct to a gpu version for the kernels and upload it.
137  upload_multitone_parameters();
138 
139  //length of the gpu buffer for applying the polyphase filter bank
140  in_out_len = parameters->fft_tones * batching;
141 
142  //allocates needed space. for description see the pointers.
143  cudaMalloc((void **)&raw_input,in_out_len*sizeof(float2));
144  cudaMalloc((void **)&input,in_out_len*sizeof(float2));
145 
146  cudaMalloc((void **)&output,(decimator_active?3:1)*in_out_len*sizeof(float2));
147  cudaMalloc((void **)&reduced_output,parameters->wave_type.size()*batching*sizeof(float2));
148 
149  //crate the fft plan
150  cufftPlanMany(&plan,1,&parameters->fft_tones,
151  NULL,1,parameters->fft_tones,
152  NULL,1,parameters->fft_tones,
153  CUFFT_C2C, batching);
154 
155  //set the fft on the priority stream
156  cufftSetStream(plan, internal_stream);
157 
158  //initialize the buffer manager
159  buf_setting = new buffer_helper(parameters->fft_tones, parameters->buffer_len, parameters->pf_average, parameters->wave_type.size());
160 
161  //if the user asks for decimation extra space is needed.
162  if(decimator_active){
163  //TODO Currently I'm decimating the full fft buffer when only the "selected tones" should be decimated.
164  //This require an apposite buffer copy kernel due to the structure of the selected tones memory.
165 
166  //the size used for this buffer it's bigger than expectes because the input size could variate.
167  cudaMalloc((void **)&decim_output,sizeof(float2)*(2.*parameters->buffer_len)/(parameters->decim));
168 
169  //initialize the post demodulator decimation buffer manager
170  pfb_decim_helper = new pfb_decimator_helper(parameters->decim, parameters->fft_tones);
171 
172  print_warning("When using TONES demodulation type, the decimation should be achieved increasing the number of pfb channels");
173  }
174 
175  break;
176 
177  case CHIRP: //valid only for single chirp
178 
179  //warn the user about the diagnostic
180  if(diagnostic)print_warning("Demodulator diagnostic enabled.");
181 
182  //set the correct function pointers for get() and close() methods
183  process_ptr = &RX_buffer_demodulator::process_chirp;
184  clr_ptr = &RX_buffer_demodulator::close_chirp;
185 
186  //create an high priority stream (allow to overlap TX and RX operation on same GPU)
187  int low_p,high_p;
188  cudaDeviceGetStreamPriorityRange ( &low_p, &high_p );
189  cudaStreamCreateWithPriority(&internal_stream,cudaStreamNonBlocking, high_p);
190 
191 
192  h_parameter.num_steps = parameters->swipe_s[0];
193  if(h_parameter.num_steps<1){
194  print_warning("Number of frequency steps of the chirp demodulator is not set. Setting it to maximum (chirp time * sampling rate).");
195  h_parameter.num_steps = parameters->chirp_t[0] * parameters->rate;
196  }
197  if(h_parameter.num_steps<2){
198  print_warning("Number of frequency steps of the chirp demodulator is less than 2. This may result in single tone demodulation.");
199  }
200 
201  //how long each tone is in samples
202  h_parameter.length = parameters->chirp_t[0] * parameters->rate / h_parameter.num_steps;
203  if(h_parameter.length<1){
204  print_warning("Duration of each frequency in chirp signal cannot be less than one sample. Setting duration of each tone to 1.");
205  h_parameter.length = 1;
206  }
207 
208  //the chirpness is expressed as double this expression somewhere
209  //h_parameter.chirpness = ((float)(parameters->chirp_f[0]-parameters->freq[0])/((float)h_parameter.num_steps-1.))/(float)parameters->rate;
210  h_parameter.chirpness = ((std::pow(2,32)-1)*(parameters->chirp_f[0]-parameters->freq[0])/((double)h_parameter.num_steps-1.))/(double)parameters->rate;
211 
212  //the algorithm used for chirp generation use this value as frequency offset
213  //h_parameter.f0 = (float)parameters->freq[0]/(float)parameters->rate;
214  h_parameter.f0 = (std::pow(2,32)-1) * ((double)parameters->freq[0]/(double)parameters->rate);
215 
216  //bookeeping updated on CPU
217  last_index = 0;
218 
219  //upload the parameter struct to the gpu
220  cudaMalloc((void **)&d_parameter,sizeof(chirp_parameter));
221  cudaMemcpy(d_parameter, &h_parameter, sizeof(chirp_parameter),cudaMemcpyHostToDevice);
222 
223  //allocte memory for the kernel operations
224  cudaMalloc((void **)&input,sizeof(float2)*parameters->buffer_len);
225  cudaMalloc((void **)&output,(decimator_active?3:1)*sizeof(float2)*parameters->buffer_len);
226 
227  //if the user asks for decimation extra space is needed.
228  if(decimator_active){
229 
230  //set the decimator parameter
231  ppt = h_parameter.length * parameters->decim;
232 
233  if(parameters->decim>1)print_warning("A decimation factor >1 requested in chirp demodulation. There is interpreted as ppt*decim");
234 
235  vna_helper = new VNA_decimator_helper(ppt, parameters->buffer_len);
236 
237  //initialize handle for cublas ops
238  cublasCreate(&handle);
239  cublasSetStream(handle,internal_stream);
240 
241 
242  zero = make_cuComplex (0.0f, 0.0f);
243  one = make_cuComplex (1.0f, 0.0f);
244 
245  //creates a profile for filtering and decimation
246  profile = make_flat_window(ppt,ppt/10,false);
247 
248  //profile = make_sinc_window(ppt,0.02);
249  //scale_buffer<<<1024,32>>>(profile,ppt,1./parameters->decim);
250  //do not discard any point
251  side = 0;
252  //the size used for this buffer it's bigger than expectes because the input size could variate.
253  cudaMalloc((void **)&decim_output,sizeof(float2)*std::ceil((float)parameters->buffer_len/(float)ppt)*8);
254 
255  //initialize the post demodulator decimation buffer manager
256  //decim_helper = new gp_decimator_helper(parameters->buffer_len,parameters->decim);
257 
258  }
259 
260  //print_chirp_params("RX",h_parameter);
261 
262  break;
263 
264  case NOISE: //returns the full spectrum result without selecting tones.
265 
266  //warn the user about the diagnostic
267  if(diagnostic)print_warning("Demodulator diagnostic enabled.");
268 
269  //assign the correct cleaning and process function pointer
270  process_ptr = &RX_buffer_demodulator::process_pfb_spec;
271  clr_ptr = &RX_buffer_demodulator::close_pfb_spec;
272 
273  //the cut-off frequency is calculated in function of the number of tones
274  fcut = 1./(2*parameters->fft_tones);
275 
276  //calculate the window and stores it in gpu memory. NOTE: this is done on the default stream.
277  window = make_sinc_window(parameters->fft_tones*parameters->pf_average, fcut, diagnostic, false);
278 
279  //convert the parameter struct to a gpu version for the kernels and upload it.
280  upload_multitone_parameters();
281 
282  //length of the gpu buffer for applying the polyphase filter bank
283  in_out_len = parameters->fft_tones * batching;
284 
285  //allocates needed space. for description see the pointers.
286  cudaMalloc((void **)&raw_input,in_out_len*sizeof(float2));
287  cudaMalloc((void **)&input,(decimator_active?3:1)*in_out_len*sizeof(float2));
288  cudaMalloc((void **)&output,in_out_len*sizeof(float2));
289 
290 
291  //crate the fft plan
292  cufftPlanMany(&plan,1,&parameters->fft_tones,
293  NULL,1,parameters->fft_tones,
294  NULL,1,parameters->fft_tones,
295  CUFFT_C2C, batching);
296 
297  //set the fft on the priority stream
298  cufftSetStream(plan, internal_stream);
299 
300  //initialize the buffer manager
301  buf_setting = new buffer_helper(parameters->fft_tones, parameters->buffer_len, parameters->pf_average, parameters->fft_tones);
302 
303  if(decimator_active){
304 
305  //the size used for this buffer it's bigger than expectes because the input size could variate.
306  //cudaMalloc((void **)&decim_output,sizeof(float2)*(2*parameters->buffer_len)/(parameters->decim));
307  cudaMalloc((void **)&decim_output,sizeof(float2)*(2*parameters->buffer_len));
308 
309  //initialize the post demodulator decimation buffer manager
310  pfb_decim_helper = new pfb_decimator_helper(parameters->decim, parameters->fft_tones);
311  }
312 
313  break;
314 
315  case NODSP:
316 
317  //this case is a pass through.
318  process_ptr = &RX_buffer_demodulator::process_nodsp;
319  clr_ptr = &RX_buffer_demodulator::close_nodsp;
320 
321  break;
322  default: //the default case means that no wave type has been selected. will transmit the full buffer. //TODO
323  print_error("Void demodulation operation has not been implemented yet!");
324  exit(-1);
325  break;
326  }
327 }
328 
329 //wrapper to the correct get function
330 int RX_buffer_demodulator::process(float2** __restrict__ in, float2** __restrict__ out){ return (this->*process_ptr)(in,out); }
331 
332 //wrapper to the correct cleaning function
333 void RX_buffer_demodulator::close(){ (this->*clr_ptr)(); }
334 
335 int RX_buffer_demodulator::process_nodsp(float2** __restrict__ input_buffer, float2** __restrict__ output_buffer){
336  //TODO: a bypass on demodulator call would be more efficient
337  std::memcpy(*output_buffer, *input_buffer, parameters->buffer_len*sizeof(float2));
338  return parameters->buffer_len;
339 }
340 
341 //process a packet demodulating with chirp
342 int RX_buffer_demodulator::process_chirp(float2** __restrict__ input_buffer, float2** __restrict__ output_buffer){
343 
344  int valid_size;
345 
346  //upload the input buffer
347  cudaMemcpyAsync(input, *input_buffer, parameters->buffer_len*sizeof(float2),cudaMemcpyHostToDevice, internal_stream);
348 
349  //zero_mem<<<1024,64,0,internal_stream>>>(input,parameters->buffer_len,1);
350 
351  //demodulate the chirp signal. NOTE: the index bookeeping is done on the device
352  chirp_demodulator_wrapper(input,output+spare_size,parameters->buffer_len,last_index,d_parameter,internal_stream);
353 
354  //update bookeeping index
355  last_index = (last_index + parameters->buffer_len) % (h_parameter.num_steps * h_parameter.length);
356 
357 
358  //gpuErrchk( cudaStreamSynchronize(internal_stream) );
359  if(decimator_active){
360 
361  valid_size = vna_helper->valid_size;
362 
363  cublas_decim(output,decim_output,profile,&zero,&one,valid_size,ppt, &handle);
364  //gpuErrchk( cudaStreamSynchronize(internal_stream) );
365 
366  //download the result on the host
367  cudaMemcpyAsync(*output_buffer, decim_output, sizeof(float2)*valid_size, cudaMemcpyDeviceToHost, internal_stream);
368 
369  spare_size = vna_helper->new0;
370 
371  //gpuErrchk( cudaStreamSynchronize(internal_stream) );
372  //move the part of the buffer that has already to be analyzed at the begin of the buffer
373  if(spare_size > 0){
375  output, //from
376  output, //to
377  vna_helper->new0, //size
378  vna_helper->spare_begin,0,
379  internal_stream); // destination offset
380  }
381 
382  vna_helper->update();
383 
384  }else{
385 
386  //download back to the host
387  cudaMemcpyAsync(*output_buffer, output, sizeof(float2)*parameters->buffer_len, cudaMemcpyDeviceToHost, internal_stream);
388  //cudaMemcpyAsync(*output_buffer, input, sizeof(float2)*parameters->buffer_len, cudaMemcpyDeviceToHost, internal_stream);
389  //set the valid putput size
390  valid_size = parameters->buffer_len;
391  }
392 
393  cudaStreamSynchronize(internal_stream);
394  //wait for operation to be completed before returning
395 
396  return valid_size;
397 }
398 
399 // @todo this function has to be splitted in decimated and undecimated case. branching is not ok here.
400 int RX_buffer_demodulator::process_direct(float2** __restrict__ input_buffer, float2** __restrict__ output_buffer){
401  int ret;
402  size_t output_channel_len = parameters->buffer_len/std::max((int)(parameters->decim),1);
403  //Load the memory
404  cudaMemcpyAsync(direct_input, *input_buffer, parameters->buffer_len*sizeof(float2),cudaMemcpyHostToDevice, internal_stream);
405  //std::cout<< "total length: "<<DIRECT_output_size<<" single tone length: "<<parameters->buffer_len <<std::endl;
406  //Call the kernel
408  DIRECT_tone_frquencies,
409  DIRECT_tone_phases,
410  parameters->rate,
411  DIRECT_current_index,
412  parameters->buffer_len,
413  DIRECT_output_size,
414  direct_input,
415  direct_output,
416  internal_stream
417  );
418 
419  //Apply FIR filtering to each channel
420  if (parameters->decim>0){
421  for(size_t i = 0; i< parameters->wave_type.size(); i++) DIRECT_FIR[i]->run_fir((direct_output)+(i*parameters->buffer_len), (FIR_output)+i*output_channel_len);
422  checkcublas(cublasCgeam(
423  handle,
424  CUBLAS_OP_T, //op A
425  CUBLAS_OP_N, //op B
426  parameters->wave_type.size(), //m
427  output_channel_len, //n
428  &onef, //aplha
429  FIR_output,output_channel_len, //A
430  &zerof, //beta
431  direct_input,output_channel_len, //B not important as beta == 0
432  transposed,(int)parameters->wave_type.size() //C
433  ));
434  }
435 
436  //Update bookeeping
437  DIRECT_current_index+=parameters->buffer_len;
438 
439  //Numerically control this value: if too big adds noise in _sinf()
440  DIRECT_current_index = DIRECT_current_index % parameters->rate;
441 
442  if (parameters->decim<=0){
443  //cudaMemcpyAsync(*output_buffer, direct_output, sizeof(float2)*DIRECT_output_size, cudaMemcpyDeviceToHost, internal_stream);
444  checkcublas(cublasCgeam(
445  handle,
446  CUBLAS_OP_T, //op A
447  CUBLAS_OP_N, //op B
448  parameters->wave_type.size(), //m
449  parameters->buffer_len, //n
450  &onef, //aplha
451  direct_output,parameters->buffer_len, //A
452  &zerof, //beta
453  direct_input,parameters->buffer_len, //B not important as beta == 0
454  transposed,(int)parameters->wave_type.size() //C
455  ));
456  cudaMemcpyAsync(*output_buffer, transposed, sizeof(float2)*DIRECT_output_size, cudaMemcpyDeviceToHost, internal_stream);
457  ret = DIRECT_output_size;
458  }else{
459  ret = DIRECT_output_size/parameters->decim;
460  cudaMemcpyAsync(*output_buffer, transposed, sizeof(float2)*ret, cudaMemcpyDeviceToHost, internal_stream);
461  }
462  cudaStreamSynchronize(internal_stream);
463  return ret;
464 }
465 
466 void RX_buffer_demodulator::close_direct(){
467  cudaStreamDestroy(internal_stream);
468  cudaFree(DIRECT_tone_frquencies);
469  cudaFree(DIRECT_tone_phases);
470  cudaFree(direct_input);
471  cudaFree(direct_output);
472  cudaFree(transposed);
473  free(DIRECT_tones);
474  if (parameters->decim>0){
475  for(size_t k=0; k<parameters->wave_type.size(); k++)delete(DIRECT_FIR[k]);
476  free(DIRECT_FIR);
477  free(fir_taps);
478  cudaFree(FIR_output);
479  }
480  return;
481 }
482 
483 //process a packet with the pfb and set the variables for the next
484 // returns the valid length of the output packet
485 int RX_buffer_demodulator::process_pfb(float2** __restrict__ input_buffer, float2** __restrict__ output_buffer){
486 
487  int output_buffer_valid_len;
488 
489  //upload the input buffer in a position determined by the buffer helper
490  cudaMemcpyAsync(
491  raw_input+buf_setting->new_0, //device address
492  *input_buffer, //host address
493  parameters->buffer_len*sizeof(float2),
494  cudaMemcpyHostToDevice,internal_stream);
495 
496  //apply the polyphase filter to the buffer
497  polyphase_filter_wrapper(raw_input,input,d_params,internal_stream);
498 
499  //execute the fft
500  cufftExecC2C(plan, input, output+spare_size, CUFFT_FORWARD);
501 
502  //move the part of the buffer that has already to be analyzed at the begin of the buffer
504  raw_input, //from
505  raw_input, //to
506  buf_setting->spare_samples, //size
507  buf_setting->spare_begin,0,
508  internal_stream); // destination offset
509 
510  if(decimator_active){
511 
512  pfb_decim_helper->update(buf_setting->current_batch);
513  spare_size = pfb_decim_helper->new_0;
514 
515  decimate_pfb(output,decim_output,parameters->fft_tones,parameters->decim,pfb_decim_helper->out_size,internal_stream);
516 
517  //move the spare buffer
519  output, //from
520  output, //to
521  pfb_decim_helper->new_0, //size
522  pfb_decim_helper->out_size,0,
523  internal_stream); // destination offset
524 
525  //move the selected tones in an other buffer excluding exceeding samples
526  tone_select_wrapper(output, reduced_output, d_params, std::floor(buf_setting->current_batch/(float)parameters->decim),internal_stream);
527 
528  //download the result in the host memory
529  cudaMemcpyAsync(*output_buffer,reduced_output,
530  parameters->wave_type.size()*batching*sizeof(float2),
531  cudaMemcpyDeviceToHost,internal_stream);
532 
533  output_buffer_valid_len = parameters->wave_type.size()*std::floor(buf_setting->current_batch/(float)parameters->decim);
534 
535  }else{
536 
537  //move the selected tones in an other buffer excluding exceeding samples
538  tone_select_wrapper(output, reduced_output, d_params, buf_setting->current_batch,internal_stream);
539 
540  //download the result in the host memory
541  cudaMemcpyAsync(*output_buffer,reduced_output,
542  parameters->wave_type.size()*batching*sizeof(float2),
543  cudaMemcpyDeviceToHost,internal_stream);
544 
545  output_buffer_valid_len = parameters->wave_type.size()*buf_setting->current_batch;
546  //print_debug("source batching is: ",buf_setting->current_batch);
547  }
548 
549 
550  //update the buffer helper
551  buf_setting->update();
552 
553  //wait for operation to be completed before returning
554  cudaStreamSynchronize(internal_stream);
555 
556  //fwrite(*output_buffer, output_buffer_valid_len * sizeof(float2), 1, raw_out_file);
557 
558 
559  //NOTE: this variable has been calculated before the buffer helper update
560  //is needed to know how many samples to stream
561  return output_buffer_valid_len;
562 
563 
564 }
565 
566 //same process as the pfb but there is no tone selection and the buffer is bully downloaded
567 int RX_buffer_demodulator::process_pfb_spec(float2** __restrict__ input_buffer, float2** __restrict__ output_buffer){
568 
569  int output_buffer_valid_len;
570 
571  //upload the input buffer in a position determined by the buffer helper
572  cudaMemcpyAsync(
573  raw_input+buf_setting->new_0, //device address
574  *input_buffer, //host address
575  parameters->buffer_len*sizeof(float2),
576  cudaMemcpyHostToDevice,internal_stream);
577 
578  //apply the polyphase filter to the buffer
579  polyphase_filter_wrapper(raw_input,input,d_params,internal_stream);
580 
581  //execute the fft
582  cufftExecC2C(plan, input, output+pfb_out, CUFFT_FORWARD);
583 
584  //move the part of the buffer that has already to be analyzed at the begin of the buffer
586  raw_input, //from
587  raw_input, //to
588  buf_setting->spare_samples, //size
589  buf_setting->spare_begin,0,
590  internal_stream); // destination offset
591 
592  if(decimator_active){
593 
594  /*
595  pfb_decim_helper->update(buf_setting->current_batch);
596  spare_size = pfb_decim_helper->new_0;
597  //std::cout<<"new_0: "<<spare_size<<std::endl<< "current_batch"<<buf_setting->current_batch<<std::endl<<"out_size"<<pfb_decim_helper->out_size<<std::endl;
598  decimate_pfb(output,decim_output,parameters->decim,parameters->fft_tones,pfb_decim_helper->out_size,internal_stream);
599  */
600 
601  //std::cout<<"Input len is: "<<buf_setting->spare_begin<<std::endl;
602  output_len = parameters->fft_tones*int((buf_setting->spare_begin/parameters->fft_tones)/(float)parameters->decim);
603  int input_len = output_len * parameters->decim;
604  //std::cout<<"Output len will be: "<<output_len<<std::endl;
605  decimate_spectra( output, decim_output, parameters->decim, parameters->fft_tones, input_len, output_len, internal_stream);
606  //move the spare buffer
607  //std::cout<<"Resifual buffer is: "<<buf_setting->spare_begin - input_len <<std::endl;
609  output, //from
610  output, //to
611  buf_setting->spare_begin - input_len, //size
612  input_len,0,
613  internal_stream); // destination offset
614 
615 
616  //download the result in the host memory
617  cudaMemcpyAsync(*output_buffer,decim_output,
618  buf_setting->copy_size*sizeof(float2),
619  cudaMemcpyDeviceToHost,internal_stream);
620 
621  //get the valid length for this buffer before updating the buffer helper but with decimation applied
622  //output_buffer_valid_len = pfb_decim_helper->out_size;
623  output_buffer_valid_len = output_len;
624 
625  }else{
626 
627 
628  //download the result in the host memory
629  cudaMemcpyAsync(*output_buffer,output,
630  buf_setting->copy_size*sizeof(float2),
631  cudaMemcpyDeviceToHost,internal_stream);
632 
633  //std::cout<<"Quick check on rx buffer"<<std::endl;
634  //for(int i = 0; i < 10; i++)std::cout<< i<<"\t"<<(*output_buffer)[i].x*1e7<<" +j*"<<(*output_buffer)[i].y*1e7<<std::endl;
635 
636  //get the valid length for this buffer before updating the buffer helper
637  output_buffer_valid_len = buf_setting->copy_size;
638  }
639 
640  //update the buffer helper
641  buf_setting->update();
642 
643  //wait for operation to be completed before returning
644  cudaStreamSynchronize(internal_stream);
645 
646  //NOTE: this variable has been calculated befor the buffer helper update
647  return output_buffer_valid_len;
648 }
649 //clean up the pfb allocations
650 void RX_buffer_demodulator::close_pfb(){
651  cufftDestroy(plan);
652  cudaStreamDestroy(internal_stream);
653  cudaFree(d_params);
654  cudaFree(input);
655  cudaFree(output);
656  cudaFree(reduced_output);
657  cudaFree(raw_input);
658  delete(buf_setting);
659  if(decimator_active){
660  delete(pfb_decim_helper);
661  cudaFree(decim_output);
662  }
663 }
664 
665 //clean up the pfb full spectrum
666 void RX_buffer_demodulator::close_pfb_spec(){
667  cufftDestroy(plan);
668  cudaStreamDestroy(internal_stream);
669  cudaFree(d_params);
670  cudaFree(input);
671  cudaFree(output);
672  cudaFree(raw_input);
673  delete(buf_setting);
674  if(decimator_active){
675  delete(pfb_decim_helper);
676  cudaFree(decim_output);
677  }
678 }
679 
680 //clean up the chirp demod allocation
681 void RX_buffer_demodulator::close_chirp(){
682  cudaStreamDestroy(internal_stream);
683  cudaFree(d_parameter);
684  cudaFree(input);
685  cudaFree(output);
686  if(decimator_active){
687  cudaFree(decim_output);
688  delete(vna_helper);
689  }
690 }
691 
692 void RX_buffer_demodulator::close_nodsp(){cudaStreamDestroy(internal_stream);}
693 
694 //converts general purpose parameters into kernel wrapper parameters on gpu.
695 //THIS ONLY TAKES CARE OF MULTI TONES MEASUREMENT
696 void RX_buffer_demodulator::upload_multitone_parameters(){
697 
698  //calculate the maximum batching. The number 3 at the end of the formula is empirical
699  // batching = std::floor((float)(parameters->buffer_len - parameters->fft_tones*parameters->pf_average)/(float)parameters->fft_tones) +3*parameters->fft_tones*parameters->pf_average;
700  batching = std::ceil((float)parameters->buffer_len/(float)parameters->fft_tones) + parameters->pf_average + 5;
701 
702 
703  //copy-paste parameters
704  h_param.window = window;
705  h_param.length = parameters->buffer_len;
706  h_param.n_tones = parameters->fft_tones;
707  h_param.average_buffer = parameters->pf_average;
708  h_param.batching = batching;
709 
710  //tones to be downloaded from the gpu at the end of a single packet analysis process
711  h_param.eff_n_tones = parameters->wave_type.size();
712 
713  int *tone_bins;
714  tone_bins = (int*)malloc(h_param.eff_n_tones*sizeof(int));
715 
716  std::vector<double> bin_axis(parameters->fft_tones);
717 
718  double bin_size = (double)parameters->rate/(double)parameters->fft_tones;
719 
720  for(size_t i = 0; i<bin_axis.size(); i++){
721  bin_axis[i] = i*bin_size - bin_size * (parameters->fft_tones/2);
722  for(int u = 0; u<h_param.eff_n_tones; u++){
723  if((parameters->freq[u] < bin_axis[i] + bin_size) && (parameters->freq[u] > bin_axis[i] - bin_size) ){
724  tone_bins[u] = (i + (parameters->fft_tones/2))%parameters->fft_tones;
725  //std::cout<<"parameter f: "<<parameters->freq[u]<<" goes in bin: "<<tone_bins[u]<<std::endl;
726  }
727  }
728  }
729  //convert the frequency parameter to fft bin
730  /*
731  for(int u = 0; u<h_param.eff_n_tones;u++){
732 
733  tone_bins[u] = parameters->freq[u]>0?
734  round((double)parameters->fft_tones * (double)parameters->freq[u]/(double)parameters->rate):
735  round((double)parameters->fft_tones*((double)1.-(double)parameters->freq[u]/(double)parameters->rate));
736  std::cout<<"parameter f: "<<parameters->freq[u]<<" goes in bin: "<<tone_bins[u]<<std::endl;
737  }
738  */
739  //allocate memory for the tone device pointer array
740  cudaMalloc((void **)&h_param.tones, h_param.eff_n_tones*sizeof(int));
741 
742  // Copy host memory to device
743  cudaMemcpy(h_param.tones, tone_bins, h_param.eff_n_tones*sizeof(int),cudaMemcpyHostToDevice);
744 
745  //in case of diagnostic, print frequency - bin table
746  if(diagnostic){
747  std::stringstream ss;
748  ss<<"Polyphase filter bank diagnostic:"<<std::endl<<"frequ\tbin"<<std::endl;
749  for(int u = 0; u<h_param.eff_n_tones;u++)ss<< parameters->freq[u]<<"\t"<<tone_bins[u]<<std::endl;
750  std::cout<<ss.str()<<std::endl;
751  }
752 
753  //cleanup the host version
754  free(tone_bins);
755 
756  //allocates spaces for the gpu copy of the parameter struct
757  cudaMalloc((void **)&d_params, sizeof(filter_param));
758 
759  // Copy the parameters to device
760  cudaMemcpy(d_params, &h_param, sizeof(filter_param),cudaMemcpyHostToDevice);
761 
762 }
void chirp_demodulator_wrapper(float2 *__restrict__ input, float2 *__restrict__ output, unsigned int output_size, unsigned long int last_index, chirp_parameter *__restrict__ info, cudaStream_t internal_stream)
Definition: kernels.cu:452
void decimate_pfb(float2 *__restrict__ input, float2 *__restrict__ output, int decim, int nfft, int output_length, cudaStream_t stram_f)
Definition: kernels.cu:900
void tone_select_wrapper(float2 *__restrict__ input, float2 *__restrict__ output, filter_param *__restrict__ filter_info, int effective_batching, cudaStream_t internal_stream)
Definition: kernels.cu:578
#define checkcublas(X)
unsigned long int num_steps
Definition: kernels.cuh:59
int * tones
How many samples per each tone are present in the device buffer.
Definition: kernels.cuh:53
float2 * make_flat_window(int length, int side, bool diagnostic)
Creates a flattop window in the GPU memory.
Definition: kernels.cu:208
float2 * window
Polyphase filter parameter wrapper and utility variables for buffer reminder.
Definition: kernels.cuh:48
unsigned long int length
Definition: kernels.cuh:60
int n_tones
Total length of the device buffer.
Definition: kernels.cuh:50
void print_error(std::string text)
float2 * make_sinc_window(int length, float fc, bool diagnostic=false, bool host_ret=false)
Definition: kernels.cu:258
RX_buffer_demodulator(param *init_parameters, bool init_diagnostic=false)
Initialization method for the class called when a new command is received. iagnostic allows to print ...
Definition: fir.hpp:7
void move_buffer_wrapper(float2 *__restrict__ from, float2 *__restrict__ to, int size, int from_offset, int to_offset, cudaStream_t internal_stream)
Definition: kernels.cu:482
int length
Pointer to an already initialized window.
Definition: kernels.cuh:49
unsigned int chirpness
Definition: kernels.cuh:61
float fcut
PFB cut-off frequency fo the window. 1.f is Nyquist at the higher sampling frequency. this parameter will be movoed to the param struct soon.
int process(float2 **__restrict__ in, float2 **__restrict__ out)
PAcket handler for DSP class. This method process information pointed by the in parameter and write t...
void polyphase_filter_wrapper(float2 *__restrict__ input, float2 *__restrict__ output, filter_param *__restrict__ filter_info, cudaStream_t internal_stream)
Definition: kernels.cu:540
int eff_n_tones
Must be an array containing the fft bin number corresponding to the tone frequency.
Definition: kernels.cuh:54
void print_warning(std::string text)
int average_buffer
How many points to calculate in the FFT.
Definition: kernels.cuh:51
void direct_demodulator_wrapper(int *__restrict tone_frequencies, int *__restrict tone_phases, int wavetablelen, size_t index_counter, size_t single_tone_length, size_t total_length, float2 *__restrict input, float2 *__restrict output, cudaStream_t internal_stream)
Definition: kernels.cu:89
int batching
How many buffer are averaged (length of the window has to be average_buffer * n_tones) ...
Definition: kernels.cuh:52
void close()
Wrapper to the correct cleaning function.
void decimate_spectra(float2 *__restrict__ input, float2 *__restrict__ output, int decim, int nfft, int input_len, int output_len, cudaStream_t stram_f)
Definition: kernels.cu:748
void cublas_decim(float2 *__restrict__ input, float2 *__restrict__ output, float2 *__restrict__ profile, cuComplex *__restrict__ zero, cuComplex *__restrict__ one, int ppt, int n_freqs, cublasHandle_t *__restrict__ handle)
Definition: kernels.cu:874