USRP_Server  2.0
A flexible, GPU-accelerated radio-frequency readout software.
kernels.cuh
Go to the documentation of this file.
1 /* @file
2  * @brief function prototypes for the GPU kernels.
3  *
4  * Contains the prototype of the functions that runs the GPU and their wrappers.
5  *
6  * @todo Clean all the decimator that you don't use.
7  * @todo Add noise generation support.
8 */
9 
10 #pragma once
11 #ifndef GPU_KERNELS_INCUDED_h
12 #define GPU_KERNELS_INCUDED_h
13 
14 #include <cuda_runtime.h>
15 #include "cublas_v2.h"
16 #include <curand.h>
17 #include <curand_kernel.h>
18 #include <cufft.h>
19 
20 #include <thrust/device_vector.h>
21 #include <thrust/device_ptr.h>
22 
23 #include "fir.hpp"
24 
26 #include <iostream>
27 #include <stdio.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <stdlib.h>
31 #include <iostream>
33 #ifndef pi_f
34 #define pi_f 3.14159265358979f
35 #define Q_PHASE_alt -1.570796327f
36 #define _31_BIT_VALUE 2147483647.5
37 #endif
38 
39 //could be used to tune for other GPUs. NOTE: it also defines the shared memory
41 #define PFB_DECIM_TPB 64. //Threads per block
42 
43 
44 
45 
46 struct filter_param {
48  float2* window;
49  int length;
50  int n_tones;
52  int batching;
53  int* tones;
55 
56 };
57 
59  unsigned long int num_steps; //number of frequency change in the chirp signal (1 means simple sinus).
60  unsigned long int length; //total length of the each frequency in the chirp signal (in samples).
61  unsigned int chirpness; //coefficient for quedratic phase calculation.
62  int f0; //start frequency
63  float freq_norm; //coefficient to adapt buffer samples to TX/RX frequency (deprecated).
64 };
65 
68  int tones_number; //how many tones to generate
69  int* tone_frquencies; //tones frequencies in Hz (frequency resolution will be 1Hz, host side)
70  float* tones_amplitudes; //tones amplitudes (linear, host side)
71 };
72 
73 
75 __global__ void direct_demodulator_fp64(
76  double* __restrict tone_frquencies,
77  size_t index_counter,
78  uint single_tone_length,
79  size_t total_length,
80  float2* __restrict intput,
81  float2* __restrict output
82 );
83 
85 __global__ void direct_demodulator_integer(
86  int* __restrict tone_frequencies,
87  int* __restrict tone_phases,
88  int wavetablelen,
89  size_t index_counter,
90  size_t single_tone_length,
91  size_t total_length,
92  float2* __restrict input,
93  float2* __restrict output
94 );
95 
96 
100  int* __restrict tone_frequencies,
101  int* __restrict tone_phases,
102  int wavetablelen,
103  size_t index_counter,
104  size_t single_tone_length,
105  size_t total_length,
106  float2* __restrict input,
107  float2* __restrict output,
108  cudaStream_t internal_stream
109 );
110 
111 void chirp_gen_wrapper(
112  float2* __restrict__ output, //pointer to the gpu buffer
113  unsigned int output_size, //size of the buffer
114  chirp_parameter* __restrict__ info, //chirp information
115  unsigned long int last_index,
116  cudaStream_t internal_stream,
117  float scale //scale the amplitude of the chirp
118 );
119 
121  float2* __restrict__ input, //pointer to the input buffer
122  float2* __restrict__ output, //pointer to the gpu buffer
123  unsigned int output_size, //size of the buffers
124  unsigned long int last_index,
125  chirp_parameter* __restrict__ info, //chirp information
126  cudaStream_t internal_stream
127 );
128 
130  float2* __restrict__ from,
131  float2* __restrict__ to,
132  int size,
133  int from_offset,
134  int to_offset,
135  cudaStream_t internal_stream
136 );
138  float2* __restrict__ input,
139  float2* __restrict__ output,
140  filter_param* __restrict__ filter_info,
141  cudaStream_t internal_stream
142 );
143 
145  float2* __restrict__ input, //must be the fft output
146  float2* __restrict__ output,//the buffer that will then be downloaded to host
147  filter_param* __restrict__ filter_info, //information about the filtering process
148  int effective_batching, //how many samples per tone have been effectively calculated
149  cudaStream_t internal_stream
150 );
151 //allocates memory on gpu and fills with a real hamming window. returns a pointer to the window on the device.
152 //note that this is a host function that wraps some device calls
153 template <typename T>
154 T* make_hamming_window(int length, int side, bool diagnostic, bool host_ret);
155 float2* make_hamming_window(int length, int side, bool diagnostic, bool host_ret);
156 
157 //allocates memory on gpu and fills with a real sinc window. returns a pointer to the window on the device.
158 //note that this is a host function that wraps some device calls
159 float2* make_sinc_window(int length, float fc, bool diagnostic, bool host_ret);
160 
162 float2* make_flat_window(int length, int side, bool diagnostic);
163 
164 void print_chirp_params(std::string comment, chirp_parameter cp);
165 
166 //generate a set of tones and return host pointer to the buffer unless the device option is true.
167 //NOTE the length of the buffer is the sampling_rate
168 float2* tone_gen(
169  tone_parameters* info, //tone information (all host side)
170  int sampling_rate,
171  float scale = 1., //scale the whole buffer (all tones) for a scalar
172  bool device = false//the function return device buffer instead
173  );
174 
175 void decimate_spectra(
176  float2* __restrict__ input, //output of the pfb
177  float2* __restrict__ output,//decimated output
178  int decim, //decimation factor (multiplicative to the pfb one)
179  int nfft, //length of the fft
180  int input_len, //could be calculated inside but I wrote an apposite class for it
181  int output_len,
182  cudaStream_t stram_f //stream on which to launch the decimator
183  );
184 
185 //decimate the output of the fft without tone selection
186 //NOTE: this thread has to be launched from its wrapper or witha Nblocks*Nthreads == out_len and
187 //it is not protected from accessing data outside input_len (see wrapper)
188 
189 #ifdef CUBLAS_API_H_
190 // cuBLAS API errors
191 void _cudaGetErrorEnum(cublasStatus_t error);
192 #endif
193 void cublas_decim(
194  float2* __restrict__ input,
195  float2* __restrict__ output,
196  float2* __restrict__ profile,
197  cuComplex* __restrict__ zero,
198  cuComplex* __restrict__ one,
199  int ppt,
200  int n_freqs,
201  cublasHandle_t* __restrict__ handle
202  );
203 
204 //wrapper for the previous fft decimation function. decimates the pfb output.
205 //NOTE: this function does not take care of the reminder and suppose that calculation
206 //to determine the output_length has already been externally done.
207 void decimate_pfb(
208  float2* __restrict__ input, //output of the pfb
209  float2* __restrict__ output,//decimated output
210  int decim, //decimation factor (multiplicative to the pfb one)
211  int nfft, //length of the fft
212  int output_length, //could be calculated inside but I wrote an apposite class for it
213  cudaStream_t stram_f //stream on which to launch the decimator
214  );
215 
216 void D_cublas_decim(
217  double2* __restrict__ input,
218  double2* __restrict__ output,
219  double2* __restrict__ profile,
220  cuDoubleComplex* __restrict__ zero,
221  cuDoubleComplex* __restrict__ one,
222  int ppt,
223  int n_freqs,
224  cublasHandle_t* __restrict__ handle
225  );
226 #endif
void D_cublas_decim(double2 *__restrict__ input, double2 *__restrict__ output, double2 *__restrict__ profile, cuDoubleComplex *__restrict__ zero, cuDoubleComplex *__restrict__ one, int ppt, int n_freqs, cublasHandle_t *__restrict__ handle)
Definition: kernels.cu:917
void cublas_decim(float2 *__restrict__ input, float2 *__restrict__ output, float2 *__restrict__ profile, cuComplex *__restrict__ zero, cuComplex *__restrict__ one, int ppt, int n_freqs, cublasHandle_t *__restrict__ handle)
Definition: kernels.cu:874
int * tone_frquencies
Definition: kernels.cuh:69
Descriptor of the mutitone generation.
Definition: kernels.cuh:67
void tone_select_wrapper(float2 *__restrict__ input, float2 *__restrict__ output, filter_param *__restrict__ filter_info, int effective_batching, cudaStream_t internal_stream)
Definition: kernels.cu:578
void decimate_spectra(float2 *__restrict__ input, float2 *__restrict__ output, int decim, int nfft, int input_len, int output_len, cudaStream_t stram_f)
Definition: kernels.cu:748
unsigned long int num_steps
Definition: kernels.cuh:59
void decimate_pfb(float2 *__restrict__ input, float2 *__restrict__ output, int decim, int nfft, int output_length, cudaStream_t stram_f)
Definition: kernels.cu:900
int * tones
How many samples per each tone are present in the device buffer.
Definition: kernels.cuh:53
float2 * window
Polyphase filter parameter wrapper and utility variables for buffer reminder.
Definition: kernels.cuh:48
unsigned long int length
Definition: kernels.cuh:60
int n_tones
Total length of the device buffer.
Definition: kernels.cuh:50
__global__ void direct_demodulator_fp64(double *__restrict tone_frquencies, size_t index_counter, uint single_tone_length, size_t total_length, float2 *__restrict intput, float2 *__restrict output)
Direct demodulation kernel. This kernel takes the raw input from the SDR and separate channels...
Definition: kernels.cu:11
float freq_norm
Definition: kernels.cuh:63
void move_buffer_wrapper(float2 *__restrict__ from, float2 *__restrict__ to, int size, int from_offset, int to_offset, cudaStream_t internal_stream)
Definition: kernels.cu:482
__global__ void direct_demodulator_integer(int *__restrict tone_frequencies, int *__restrict tone_phases, int wavetablelen, size_t index_counter, size_t single_tone_length, size_t total_length, float2 *__restrict input, float2 *__restrict output)
Integer version of the direct demodulation kernel (numerically more stable). This kernel takes the ra...
Definition: kernels.cu:45
int length
Pointer to an already initialized window.
Definition: kernels.cuh:49
unsigned int chirpness
Definition: kernels.cuh:61
T * make_hamming_window(int length, int side, bool diagnostic, bool host_ret)
Definition: kernels.cu:107
float2 * tone_gen(tone_parameters *info, int sampling_rate, float scale=1., bool device=false)
Definition: kernels.cu:611
void direct_demodulator_wrapper(int *__restrict tone_frequencies, int *__restrict tone_phases, int wavetablelen, size_t index_counter, size_t single_tone_length, size_t total_length, float2 *__restrict input, float2 *__restrict output, cudaStream_t internal_stream)
Definition: kernels.cu:89
float * tones_amplitudes
Definition: kernels.cuh:70
void print_chirp_params(std::string comment, chirp_parameter cp)
Definition: kernels.cu:335
float2 * make_sinc_window(int length, float fc, bool diagnostic, bool host_ret)
Definition: kernels.cu:258
int eff_n_tones
Must be an array containing the fft bin number corresponding to the tone frequency.
Definition: kernels.cuh:54
void chirp_demodulator_wrapper(float2 *__restrict__ input, float2 *__restrict__ output, unsigned int output_size, unsigned long int last_index, chirp_parameter *__restrict__ info, cudaStream_t internal_stream)
Definition: kernels.cu:452
int average_buffer
How many points to calculate in the FFT.
Definition: kernels.cuh:51
float2 * make_flat_window(int length, int side, bool diagnostic)
Creates a flattop window in the GPU memory.
Definition: kernels.cu:208
void polyphase_filter_wrapper(float2 *__restrict__ input, float2 *__restrict__ output, filter_param *__restrict__ filter_info, cudaStream_t internal_stream)
Definition: kernels.cu:540
int batching
How many buffer are averaged (length of the window has to be average_buffer * n_tones) ...
Definition: kernels.cuh:52
void chirp_gen_wrapper(float2 *__restrict__ output, unsigned int output_size, chirp_parameter *__restrict__ info, unsigned long int last_index, cudaStream_t internal_stream, float scale)
Definition: kernels.cu:395