Visualizing audio can seem quite complicated if you get bogged down in the mathematical background. But with some trial and error, the final code turns out to be fairly simple.
You start by defining a few data structures:
static int16_t left_bands[32];  // Left channel frequency bands
static int16_t right_bands[32]; // Right channel frequency bands

static RDFTContext *ctx;

static int N, samples; // FFT size and number of samples to process each step
The first step is to initialise libav's FFT library. N is the size of the FFT.
void visualize_init(int samples_) {
    samples = samples_;
    N = samples_ / 2; // interleaved left/right channels
    ctx = av_rdft_init((int) log2(N), DFT_R2C);
}
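One thing to keep in mind: av_rdft_init takes the base-2 logarithm of the transform size, so N must be a power of two. If your sample count doesn't guarantee that, you could round down inside visualize_init before creating the context. A minimal sketch (this guard is my addition, not part of the original code):

    // Round N down to the nearest power of two (assumption: dropping
    // the trailing samples is acceptable for visualization purposes).
    int bits = (int) log2(N);
    if ((1 << bits) != N) {
        N = 1 << bits;
    }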
Now let's start visualizing our buffer:
void buffer_visualize(int16_t *data) {
    int i, tight_index; // just some iterator indices

    float left_data[N * 2];
    float right_data[N * 2];
data will contain the audio information in an interleaved manner. This means one sample left, one sample right, one sample left and so on… So in the next step we're going to split it, convert the integers to floats, apply a window function and write them to our temporary buffers.
    for (i = 0, tight_index = 0; i < samples; i += 2, tight_index++) {
        int16_t left = data[i]; // even indices hold the left channel

        double window_modifier = (0.5 * (1 - cos(2 * M_PI * tight_index / (N - 1)))); // Hann (Hanning) window function
        float value = (float) (window_modifier * (left / 32768.0f)); // convert to float and apply the window

        // clamp values above 1 and below -1
        if (value > 1.0) {
            value = 1;
        } else if (value < -1.0) {
            value = -1;
        }

        left_data[tight_index] = value;
    }
Also repeat this for the right channel, which uses the odd interleaved samples (a sketch follows below). Finally we can pass our data to the FFT library:
    av_rdft_calc(ctx, left_data);
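For completeness, here is how the right channel could be handled. This mirror of the loop above is my sketch, assuming the interleaved layout described earlier:

    // Right channel: the odd interleaved samples (mirrors the left-channel loop)
    for (i = 1, tight_index = 0; i < samples; i += 2, tight_index++) {
        int16_t right = data[i];

        double window_modifier = (0.5 * (1 - cos(2 * M_PI * tight_index / (N - 1)))); // Hann window
        float value = (float) (window_modifier * (right / 32768.0f));

        if (value > 1.0) {
            value = 1;
        } else if (value < -1.0) {
            value = -1;
        }

        right_data[tight_index] = value;
    }

    av_rdft_calc(ctx, right_data);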
The next part is the actual visualization. You will probably want to visualize it in a different way in your own application.
    int size = N / 2 * 2; // only half of the output is usable, but each bin has a re and an im part

    for (i = 0, tight_index = 0; i < size; i += size / WIDTH) {
        float im = left_data[i];
        float re = left_data[i + 1];
        double mag = sqrt(im * im + re * re);

        // Visualize the magnitude of the i-th band (WIDTH and HEIGHT
        // are the dimensions of the output display)
        left_bands[tight_index] = (int16_t) (mag * HEIGHT);

        tight_index++;
    }
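To get a quick look at the result without any graphics code, you can dump the bands as text. This helper is my own debugging sketch, assuming WIDTH bands (WIDTH <= 32) were written to left_bands by the loop above:

    #include <stdio.h>

    // Print each band as a horizontal bar of '#' characters (debug sketch).
    void print_bands(void) {
        int band, j;
        for (band = 0; band < WIDTH; band++) {
            int height = left_bands[band];
            if (height > HEIGHT) {
                height = HEIGHT; // clamp to the display height
            }
            for (j = 0; j < height; j++) {
                putchar('#');
            }
            putchar('\n');
        }
    }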
The first bin in the FFT is DC (0 Hz), the second bin is Fs / N, where Fs is the sample rate and N is the size of the FFT. The next bin is 2 * Fs / N. To express this in general terms, the nth bin is n * Fs / N.
So if your sample rate Fs is, say, 44.1 kHz and your FFT size N is 1024, then the FFT output bins are at:
0: 0 * 44100 / 1024 = 0.0 Hz
1: 1 * 44100 / 1024 = 43.1 Hz
2: 2 * 44100 / 1024 = 86.1 Hz
3: 3 * 44100 / 1024 = 129.2 Hz
4: ...
5: ...
...
511: 511 * 44100 / 1024 = 22006.9 Hz
Note that for a real input signal (imaginary parts all zero) the second half of the FFT (bins from N / 2 + 1 to N - 1) contains no useful additional information (the bins have complex conjugate symmetry with the first N / 2 - 1 bins). The last useful bin (for practical applications) is at N / 2 - 1, which corresponds to 22006.9 Hz in the above example. The bin at N / 2 represents energy at the Nyquist frequency, i.e. Fs / 2 (= 22050 Hz in this example), but this is in general not of any practical use, since anti-aliasing filters will typically attenuate any signals at and above Fs / 2. (Source: Stack Overflow)
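If you want that mapping in code, the formula reduces to a one-liner. A small helper, as a sketch (the function name is mine, not from the original source):

    // Frequency in Hz of the nth FFT bin, given the sample rate Fs and FFT size N.
    static double bin_frequency(int n, double sample_rate, int fft_size) {
        return n * sample_rate / fft_size;
    }

    // Example: bin_frequency(1, 44100, 1024) is ~43.1 Hz, matching the table above.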
But how do you use this after decoding and resampling? You have to use AV_SAMPLE_FMT_S16 as the output format, so initialize the resampling library as follows:
enum AVSampleFormat init_resampling(AVAudioResampleContext **out_resample, AVCodecContext *dec_ctx) {
    AVAudioResampleContext *resample = avresample_alloc_context();

    int64_t layout = av_get_default_channel_layout(dec_ctx->channels);
    int sample_rate = dec_ctx->sample_rate;
    enum AVSampleFormat output_fmt = AV_SAMPLE_FMT_S16;

    av_opt_set_int(resample, "in_channel_layout", layout, 0);
    av_opt_set_int(resample, "out_channel_layout", layout, 0);
    av_opt_set_int(resample, "in_sample_rate", sample_rate, 0);
    av_opt_set_int(resample, "out_sample_rate", sample_rate, 0);
    av_opt_set_int(resample, "in_sample_fmt", dec_ctx->sample_fmt, 0);
    av_opt_set_int(resample, "out_sample_fmt", output_fmt, 0);

    avresample_open(resample);

    *out_resample = resample;

    return output_fmt;
}
Then just decode the audio and pass it to our processing function. The normalizing and resampling part can be quite tricky since it's not well documented, but here's a working example:
// Packet
AVPacket packet;
av_init_packet(&packet);

// Frame
AVFrame *frame = avcodec_alloc_frame();

// Contexts
AVAudioResampleContext *resample = 0;
AVFormatContext *fmt_ctx = 0;
AVCodecContext *dec_ctx = 0;

int audio_stream_index = open_file(file_path, &fmt_ctx, &dec_ctx);

if (audio_stream_index < 0) {
    av_log(NULL, AV_LOG_ERROR, "Error opening file\n");
    return audio_stream_index;
}

// Setup resampling
enum AVSampleFormat output_fmt = init_resampling(&resample, dec_ctx);

visualize_init(4096 / sizeof(int16_t)); // 4096 is the default sample size of libav

while (1) {
    if ((av_read_frame(fmt_ctx, &packet)) < 0) {
        break;
    }

    if (packet.stream_index == audio_stream_index) {
        int got_frame = 0;

        int ret = avcodec_decode_audio4(dec_ctx, frame, &got_frame, &packet);
        if (ret < 0) {
            av_log(NULL, AV_LOG_ERROR, "Error decoding audio\n");
            av_free_packet(&packet); // don't leak the packet on error
            continue;
        }

        if (got_frame) {
            // Normalize the stream by resampling it
            uint8_t *output;
            int out_linesize;
            int out_samples = avresample_get_out_samples(resample, frame->nb_samples);
            av_samples_alloc(&output, &out_linesize, 2, out_samples, output_fmt, 0);

            avresample_convert(resample, &output, out_linesize, out_samples,
                               frame->data, frame->linesize[0], frame->nb_samples);

            buffer_visualize((int16_t *) output);

            av_freep(&output);
        }
    }

    av_free_packet(&packet); // release the packet's buffer each iteration
}
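After the loop you should release everything that was allocated. A minimal cleanup sketch for the libav versions used above (the exact free functions depend on your libav release):

    av_rdft_end(ctx);           // FFT context from visualize_init
    avresample_free(&resample); // resampler from init_resampling
    avcodec_free_frame(&frame); // av_frame_free() in newer releases
    avcodec_close(dec_ctx);
    avformat_close_input(&fmt_ctx);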
The example source can be viewed here.