LCOV - code coverage report
Current view: top level - media/libopus/src - analysis.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 452 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 7 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* Copyright (c) 2011 Xiph.Org Foundation
       2             :    Written by Jean-Marc Valin */
       3             : /*
       4             :    Redistribution and use in source and binary forms, with or without
       5             :    modification, are permitted provided that the following conditions
       6             :    are met:
       7             : 
       8             :    - Redistributions of source code must retain the above copyright
       9             :    notice, this list of conditions and the following disclaimer.
      10             : 
      11             :    - Redistributions in binary form must reproduce the above copyright
      12             :    notice, this list of conditions and the following disclaimer in the
      13             :    documentation and/or other materials provided with the distribution.
      14             : 
      15             :    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
      16             :    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
      17             :    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
      18             :    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
      19             :    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
      20             :    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
      21             :    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
      22             :    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
      23             :    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
      24             :    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
      25             :    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      26             : */
      27             : 
      28             : #ifdef HAVE_CONFIG_H
      29             : #include "config.h"
      30             : #endif
      31             : 
      32             : #define ANALYSIS_C
      33             : 
      34             : #include <stdio.h>
      35             : 
      36             : #include "mathops.h"
      37             : #include "kiss_fft.h"
      38             : #include "celt.h"
      39             : #include "modes.h"
      40             : #include "arch.h"
      41             : #include "quant_bands.h"
      42             : #include "analysis.h"
      43             : #include "mlp.h"
      44             : #include "stack_alloc.h"
      45             : #include "float_cast.h"
      46             : 
      47             : #ifndef M_PI
      48             : #define M_PI 3.141592653
      49             : #endif
      50             : 
      51             : #ifndef DISABLE_FLOAT_API
      52             : 
      53             : static const float dct_table[128] = {
      54             :         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
      55             :         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
      56             :         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
      57             :        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
      58             :         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
      59             :        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
      60             :         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
      61             :         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
      62             :         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
      63             :         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
      64             :         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
      65             :        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
      66             :         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
      67             :        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
      68             :         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
      69             :         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
      70             : };
      71             : 
      72             : static const float analysis_window[240] = {
      73             :       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
      74             :       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
      75             :       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
      76             :       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
      77             :       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
      78             :       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
      79             :       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
      80             :       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
      81             :       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
      82             :       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
      83             :       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
      84             :       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
      85             :       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
      86             :       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
      87             :       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
      88             :       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
      89             :       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
      90             :       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
      91             :       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
      92             :       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
      93             :       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
      94             :       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
      95             :       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
      96             :       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
      97             :       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
      98             :       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
      99             :       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
     100             :       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
     101             :       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
     102             :       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
     103             : };
     104             : 
     105             : static const int tbands[NB_TBANDS+1] = {
     106             :       4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
     107             : };
     108             : 
     109             : #define NB_TONAL_SKIP_BANDS 9
     110             : 
     111           0 : static opus_val32 silk_resampler_down2_hp(
     112             :     opus_val32                  *S,                 /* I/O  State vector [ 2 ]                                          */
     113             :     opus_val32                  *out,               /* O    Output signal [ floor(len/2) ]                              */
     114             :     const opus_val32            *in,                /* I    Input signal [ len ]                                        */
     115             :     int                         inLen               /* I    Number of input samples                                     */
     116             : )
     117             : {
     118           0 :     int k, len2 = inLen/2;
     119             :     opus_val32 in32, out32, out32_hp, Y, X;
     120           0 :     opus_val64 hp_ener = 0;
     121             :     /* Internal variables and state are in Q10 format */
     122           0 :     for( k = 0; k < len2; k++ ) {
     123             :         /* Convert to Q10 */
     124           0 :         in32 = in[ 2 * k ];
     125             : 
     126             :         /* All-pass section for even input sample */
     127           0 :         Y      = SUB32( in32, S[ 0 ] );
     128           0 :         X      = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
     129           0 :         out32  = ADD32( S[ 0 ], X );
     130           0 :         S[ 0 ] = ADD32( in32, X );
     131           0 :         out32_hp = out32;
     132             :         /* Convert to Q10 */
     133           0 :         in32 = in[ 2 * k + 1 ];
     134             : 
     135             :         /* All-pass section for odd input sample, and add to output of previous section */
     136           0 :         Y      = SUB32( in32, S[ 1 ] );
     137           0 :         X      = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
     138           0 :         out32  = ADD32( out32, S[ 1 ] );
     139           0 :         out32  = ADD32( out32, X );
     140           0 :         S[ 1 ] = ADD32( in32, X );
     141             : 
     142           0 :         Y      = SUB32( -in32, S[ 2 ] );
     143           0 :         X      = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
     144           0 :         out32_hp  = ADD32( out32_hp, S[ 2 ] );
     145           0 :         out32_hp  = ADD32( out32_hp, X );
     146           0 :         S[ 2 ] = ADD32( -in32, X );
     147             : 
     148           0 :         hp_ener += out32_hp*(opus_val64)out32_hp;
     149             :         /* Add, convert back to int16 and store to output */
     150           0 :         out[ k ] = HALF32(out32);
     151             :     }
     152             : #ifdef FIXED_POINT
     153             :     /* len2 can be up to 480, so we shift by 8 more to make it fit. */
     154             :     hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
     155             : #endif
     156           0 :     return (opus_val32)hp_ener;
     157             : }
     158             : 
     159           0 : static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
     160             : {
     161             :    VARDECL(opus_val32, tmp);
     162             :    opus_val32 scale;
     163             :    int j;
     164           0 :    opus_val32 ret = 0;
     165             :    SAVE_STACK;
     166             : 
     167           0 :    if (subframe==0) return 0;
     168           0 :    if (Fs == 48000)
     169             :    {
     170           0 :       subframe *= 2;
     171           0 :       offset *= 2;
     172           0 :    } else if (Fs == 16000) {
     173           0 :       subframe = subframe*2/3;
     174           0 :       offset = offset*2/3;
     175             :    }
     176           0 :    ALLOC(tmp, subframe, opus_val32);
     177             : 
     178           0 :    downmix(_x, tmp, subframe, offset, c1, c2, C);
     179             : #ifdef FIXED_POINT
     180             :    scale = (1<<SIG_SHIFT);
     181             : #else
     182           0 :    scale = 1.f/32768;
     183             : #endif
     184           0 :    if (c2==-2)
     185           0 :       scale /= C;
     186           0 :    else if (c2>-1)
     187           0 :       scale /= 2;
     188           0 :    for (j=0;j<subframe;j++)
     189           0 :       tmp[j] *= scale;
     190           0 :    if (Fs == 48000)
     191             :    {
     192           0 :       ret = silk_resampler_down2_hp(S, y, tmp, subframe);
     193           0 :    } else if (Fs == 24000) {
     194           0 :       OPUS_COPY(y, tmp, subframe);
     195           0 :    } else if (Fs == 16000) {
     196             :       VARDECL(opus_val32, tmp3x);
     197           0 :       ALLOC(tmp3x, 3*subframe, opus_val32);
     198             :       /* Don't do this at home! This resampler is horrible and it's only (barely)
     199             :          usable for the purpose of the analysis because we don't care about all
     200             :          the aliasing between 8 kHz and 12 kHz. */
     201           0 :       for (j=0;j<subframe;j++)
     202             :       {
     203           0 :          tmp3x[3*j] = tmp[j];
     204           0 :          tmp3x[3*j+1] = tmp[j];
     205           0 :          tmp3x[3*j+2] = tmp[j];
     206             :       }
     207           0 :       silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
     208             :    }
     209             :    RESTORE_STACK;
     210           0 :    return ret;
     211             : }
     212             : 
     213           0 : void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
     214             : {
     215             :   /* Initialize reusable fields. */
     216           0 :   tonal->arch = opus_select_arch();
     217           0 :   tonal->Fs = Fs;
     218             :   /* Clear remaining fields. */
     219           0 :   tonality_analysis_reset(tonal);
     220           0 : }
     221             : 
     222           0 : void tonality_analysis_reset(TonalityAnalysisState *tonal)
     223             : {
     224             :   /* Clear non-reusable fields. */
     225           0 :   char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
     226           0 :   OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
     227           0 :   tonal->music_confidence = .9f;
     228           0 :   tonal->speech_confidence = .1f;
     229           0 : }
     230             : 
     231           0 : void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
     232             : {
     233             :    int pos;
     234             :    int curr_lookahead;
     235             :    float psum;
     236             :    float tonality_max;
     237             :    float tonality_avg;
     238             :    int tonality_count;
     239             :    int i;
     240             : 
     241           0 :    pos = tonal->read_pos;
     242           0 :    curr_lookahead = tonal->write_pos-tonal->read_pos;
     243           0 :    if (curr_lookahead<0)
     244           0 :       curr_lookahead += DETECT_SIZE;
     245             : 
     246             :    /* On long frames, look at the second analysis window rather than the first. */
     247           0 :    if (len > tonal->Fs/50 && pos != tonal->write_pos)
     248             :    {
     249           0 :       pos++;
     250           0 :       if (pos==DETECT_SIZE)
     251           0 :          pos=0;
     252             :    }
     253           0 :    if (pos == tonal->write_pos)
     254           0 :       pos--;
     255           0 :    if (pos<0)
     256           0 :       pos = DETECT_SIZE-1;
     257           0 :    OPUS_COPY(info_out, &tonal->info[pos], 1);
     258           0 :    tonality_max = tonality_avg = info_out->tonality;
     259           0 :    tonality_count = 1;
     260             :    /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
     261           0 :    for (i=0;i<3;i++)
     262             :    {
     263           0 :       pos++;
     264           0 :       if (pos==DETECT_SIZE)
     265           0 :          pos = 0;
     266           0 :       if (pos == tonal->write_pos)
     267           0 :          break;
     268           0 :       tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
     269           0 :       tonality_avg += tonal->info[pos].tonality;
     270           0 :       tonality_count++;
     271             :    }
     272           0 :    info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
     273           0 :    tonal->read_subframe += len/(tonal->Fs/400);
     274           0 :    while (tonal->read_subframe>=8)
     275             :    {
     276           0 :       tonal->read_subframe -= 8;
     277           0 :       tonal->read_pos++;
     278             :    }
     279           0 :    if (tonal->read_pos>=DETECT_SIZE)
     280           0 :       tonal->read_pos-=DETECT_SIZE;
     281             : 
     282             :    /* The -1 is to compensate for the delay in the features themselves. */
     283           0 :    curr_lookahead = IMAX(curr_lookahead-1, 0);
     284             : 
     285           0 :    psum=0;
     286             :    /* Summing the probability of transition patterns that involve music at
     287             :       time (DETECT_SIZE-curr_lookahead-1) */
     288           0 :    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
     289           0 :       psum += tonal->pmusic[i];
     290           0 :    for (;i<DETECT_SIZE;i++)
     291           0 :       psum += tonal->pspeech[i];
     292           0 :    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
     293             :    /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/
     294             : 
     295           0 :    info_out->music_prob = psum;
     296           0 : }
     297             : 
     298             : static const float std_feature_bias[9] = {
     299             :       5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
     300             :       2.163313f, 1.260756f, 1.116868f, 1.918795f
     301             : };
     302             : 
     303             : #define LEAKAGE_OFFSET 2.5f
     304             : #define LEAKAGE_SLOPE 2.f
     305             : 
     306             : #ifdef FIXED_POINT
     307             : /* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
     308             :    compensate for that in the energy. */
     309             : #define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
     310             : #define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
     311             : #else
     312             : #define SCALE_ENER(e) (e)
     313             : #endif
     314             : 
     315           0 : static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
     316             : {
     317             :     int i, b;
     318             :     const kiss_fft_state *kfft;
     319             :     VARDECL(kiss_fft_cpx, in);
     320             :     VARDECL(kiss_fft_cpx, out);
     321           0 :     int N = 480, N2=240;
     322           0 :     float * OPUS_RESTRICT A = tonal->angle;
     323           0 :     float * OPUS_RESTRICT dA = tonal->d_angle;
     324           0 :     float * OPUS_RESTRICT d2A = tonal->d2_angle;
     325             :     VARDECL(float, tonality);
     326             :     VARDECL(float, noisiness);
     327             :     float band_tonality[NB_TBANDS];
     328             :     float logE[NB_TBANDS];
     329             :     float BFCC[8];
     330             :     float features[25];
     331             :     float frame_tonality;
     332             :     float max_frame_tonality;
     333             :     /*float tw_sum=0;*/
     334             :     float frame_noisiness;
     335           0 :     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
     336           0 :     float slope=0;
     337             :     float frame_stationarity;
     338             :     float relativeE;
     339             :     float frame_probs[2];
     340             :     float alpha, alphaE, alphaE2;
     341             :     float frame_loudness;
     342             :     float bandwidth_mask;
     343           0 :     int bandwidth=0;
     344           0 :     float maxE = 0;
     345             :     float noise_floor;
     346             :     int remaining;
     347             :     AnalysisInfo *info;
     348             :     float hp_ener;
     349             :     float tonality2[240];
     350             :     float midE[8];
     351           0 :     float spec_variability=0;
     352             :     float band_log2[NB_TBANDS+1];
     353             :     float leakage_from[NB_TBANDS+1];
     354             :     float leakage_to[NB_TBANDS+1];
     355             :     SAVE_STACK;
     356             : 
     357           0 :     alpha = 1.f/IMIN(10, 1+tonal->count);
     358           0 :     alphaE = 1.f/IMIN(25, 1+tonal->count);
     359           0 :     alphaE2 = 1.f/IMIN(500, 1+tonal->count);
     360             : 
     361           0 :     if (tonal->Fs == 48000)
     362             :     {
     363             :        /* len and offset are now at 24 kHz. */
     364           0 :        len/= 2;
     365           0 :        offset /= 2;
     366           0 :     } else if (tonal->Fs == 16000) {
     367           0 :        len = 3*len/2;
     368           0 :        offset = 3*offset/2;
     369             :     }
     370             : 
     371           0 :     if (tonal->count<4) {
     372           0 :        if (tonal->application == OPUS_APPLICATION_VOIP)
     373           0 :           tonal->music_prob = .1f;
     374             :        else
     375           0 :           tonal->music_prob = .625f;
     376             :     }
     377           0 :     kfft = celt_mode->mdct.kfft[0];
     378           0 :     if (tonal->count==0)
     379           0 :        tonal->mem_fill = 240;
     380           0 :     tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
     381           0 :           &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
     382           0 :           IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
     383           0 :     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
     384             :     {
     385           0 :        tonal->mem_fill += len;
     386             :        /* Don't have enough to update the analysis */
     387             :        RESTORE_STACK;
     388           0 :        return;
     389             :     }
     390           0 :     hp_ener = tonal->hp_ener_accum;
     391           0 :     info = &tonal->info[tonal->write_pos++];
     392           0 :     if (tonal->write_pos>=DETECT_SIZE)
     393           0 :        tonal->write_pos-=DETECT_SIZE;
     394             : 
     395           0 :     ALLOC(in, 480, kiss_fft_cpx);
     396           0 :     ALLOC(out, 480, kiss_fft_cpx);
     397           0 :     ALLOC(tonality, 240, float);
     398           0 :     ALLOC(noisiness, 240, float);
     399           0 :     for (i=0;i<N2;i++)
     400             :     {
     401           0 :        float w = analysis_window[i];
     402           0 :        in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
     403           0 :        in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
     404           0 :        in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
     405           0 :        in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
     406             :     }
     407           0 :     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
     408           0 :     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
     409           0 :     tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
     410           0 :           &tonal->inmem[240], tonal->downmix_state, remaining,
     411           0 :           offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
     412           0 :     tonal->mem_fill = 240 + remaining;
     413           0 :     opus_fft(kfft, in, out, tonal->arch);
     414             : #ifndef FIXED_POINT
     415             :     /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
     416           0 :     if (celt_isnan(out[0].r))
     417             :     {
     418           0 :        info->valid = 0;
     419             :        RESTORE_STACK;
     420           0 :        return;
     421             :     }
     422             : #endif
     423             : 
     424           0 :     for (i=1;i<N2;i++)
     425             :     {
     426             :        float X1r, X2r, X1i, X2i;
     427             :        float angle, d_angle, d2_angle;
     428             :        float angle2, d_angle2, d2_angle2;
     429             :        float mod1, mod2, avg_mod;
     430           0 :        X1r = (float)out[i].r+out[N-i].r;
     431           0 :        X1i = (float)out[i].i-out[N-i].i;
     432           0 :        X2r = (float)out[i].i+out[N-i].i;
     433           0 :        X2i = (float)out[N-i].r-out[i].r;
     434             : 
     435           0 :        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
     436           0 :        d_angle = angle - A[i];
     437           0 :        d2_angle = d_angle - dA[i];
     438             : 
     439           0 :        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
     440           0 :        d_angle2 = angle2 - angle;
     441           0 :        d2_angle2 = d_angle2 - d_angle;
     442             : 
     443           0 :        mod1 = d2_angle - (float)float2int(d2_angle);
     444           0 :        noisiness[i] = ABS16(mod1);
     445           0 :        mod1 *= mod1;
     446           0 :        mod1 *= mod1;
     447             : 
     448           0 :        mod2 = d2_angle2 - (float)float2int(d2_angle2);
     449           0 :        noisiness[i] += ABS16(mod2);
     450           0 :        mod2 *= mod2;
     451           0 :        mod2 *= mod2;
     452             : 
     453           0 :        avg_mod = .25f*(d2A[i]+mod1+2*mod2);
     454             :        /* This introduces an extra delay of 2 frames in the detection. */
     455           0 :        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
     456             :        /* No delay on this detection, but it's less reliable. */
     457           0 :        tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
     458             : 
     459           0 :        A[i] = angle2;
     460           0 :        dA[i] = d_angle2;
     461           0 :        d2A[i] = mod2;
     462             :     }
     463           0 :     for (i=2;i<N2-1;i++)
     464             :     {
     465           0 :        float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
     466           0 :        tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
     467             :     }
     468           0 :     frame_tonality = 0;
     469           0 :     max_frame_tonality = 0;
     470             :     /*tw_sum = 0;*/
     471           0 :     info->activity = 0;
     472           0 :     frame_noisiness = 0;
     473           0 :     frame_stationarity = 0;
     474           0 :     if (!tonal->count)
     475             :     {
     476           0 :        for (b=0;b<NB_TBANDS;b++)
     477             :        {
     478           0 :           tonal->lowE[b] = 1e10;
     479           0 :           tonal->highE[b] = -1e10;
     480             :        }
     481             :     }
     482           0 :     relativeE = 0;
     483           0 :     frame_loudness = 0;
     484             :     /* The energy of the very first band is special because of DC. */
     485             :     {
     486           0 :        float E = 0;
     487             :        float X1r, X2r;
     488           0 :        X1r = 2*(float)out[0].r;
     489           0 :        X2r = 2*(float)out[0].i;
     490           0 :        E = X1r*X1r + X2r*X2r;
     491           0 :        for (i=1;i<4;i++)
     492             :        {
     493           0 :           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
     494           0 :                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
     495           0 :           E += binE;
     496             :        }
     497           0 :        E = SCALE_ENER(E);
     498           0 :        band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
     499             :     }
     500           0 :     for (b=0;b<NB_TBANDS;b++)
     501             :     {
     502           0 :        float E=0, tE=0, nE=0;
     503             :        float L1, L2;
     504             :        float stationarity;
     505           0 :        for (i=tbands[b];i<tbands[b+1];i++)
     506             :        {
     507           0 :           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
     508           0 :                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
     509           0 :           binE = SCALE_ENER(binE);
     510           0 :           E += binE;
     511           0 :           tE += binE*MAX32(0, tonality[i]);
     512           0 :           nE += binE*2.f*(.5f-noisiness[i]);
     513             :        }
     514             : #ifndef FIXED_POINT
     515             :        /* Check for extreme band energies that could cause NaNs later. */
     516           0 :        if (!(E<1e9f) || celt_isnan(E))
     517             :        {
     518           0 :           info->valid = 0;
     519             :           RESTORE_STACK;
     520           0 :           return;
     521             :        }
     522             : #endif
     523             : 
     524           0 :        tonal->E[tonal->E_count][b] = E;
     525           0 :        frame_noisiness += nE/(1e-15f+E);
     526             : 
     527           0 :        frame_loudness += (float)sqrt(E+1e-10f);
     528           0 :        logE[b] = (float)log(E+1e-10f);
     529           0 :        band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
     530           0 :        tonal->logE[tonal->E_count][b] = logE[b];
     531           0 :        if (tonal->count==0)
     532           0 :           tonal->highE[b] = tonal->lowE[b] = logE[b];
     533           0 :        if (tonal->highE[b] > tonal->lowE[b] + 7.5)
     534             :        {
     535           0 :           if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
     536           0 :              tonal->highE[b] -= .01f;
     537             :           else
     538           0 :              tonal->lowE[b] += .01f;
     539             :        }
     540           0 :        if (logE[b] > tonal->highE[b])
     541             :        {
     542           0 :           tonal->highE[b] = logE[b];
     543           0 :           tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
     544           0 :        } else if (logE[b] < tonal->lowE[b])
     545             :        {
     546           0 :           tonal->lowE[b] = logE[b];
     547           0 :           tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
     548             :        }
     549           0 :        relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->lowE[b]));
     550             : 
     551           0 :        L1=L2=0;
     552           0 :        for (i=0;i<NB_FRAMES;i++)
     553             :        {
     554           0 :           L1 += (float)sqrt(tonal->E[i][b]);
     555           0 :           L2 += tonal->E[i][b];
     556             :        }
     557             : 
     558           0 :        stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
     559           0 :        stationarity *= stationarity;
     560           0 :        stationarity *= stationarity;
     561           0 :        frame_stationarity += stationarity;
     562             :        /*band_tonality[b] = tE/(1e-15+E)*/;
     563           0 :        band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
     564             : #if 0
     565             :        if (b>=NB_TONAL_SKIP_BANDS)
     566             :        {
     567             :           frame_tonality += tweight[b]*band_tonality[b];
     568             :           tw_sum += tweight[b];
     569             :        }
     570             : #else
     571           0 :        frame_tonality += band_tonality[b];
     572           0 :        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
     573           0 :           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
     574             : #endif
     575           0 :        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
     576           0 :        slope += band_tonality[b]*(b-8);
     577             :        /*printf("%f %f ", band_tonality[b], stationarity);*/
     578           0 :        tonal->prev_band_tonality[b] = band_tonality[b];
     579             :     }
     580             : 
     581           0 :     leakage_from[0] = band_log2[0];
     582           0 :     leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
     583           0 :     for (b=1;b<NB_TBANDS+1;b++)
     584             :     {
     585           0 :        float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
     586           0 :        leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
     587           0 :        leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET);
     588             :     }
     589           0 :     for (b=NB_TBANDS-2;b>=0;b--)
     590             :     {
     591           0 :        float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
     592           0 :        leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
     593           0 :        leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
     594             :     }
     595             :     celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
     596           0 :     for (b=0;b<NB_TBANDS+1;b++)
     597             :     {
     598             :        /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
     599             :           represents the boost needed to overcome the amount of analysis leakage
     600             :           cause in a weaker band b by louder neighbouring bands.
     601             :           The second, based on leakage_from[], applies to a loud band b for
     602             :           which the quantization noise causes synthesis leakage to the weaker
     603             :           neighbouring bands. */
     604           0 :        float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
     605           0 :              MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
     606           0 :        info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
     607             :     }
     608           0 :     for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;
     609             : 
     610           0 :     for (i=0;i<NB_FRAMES;i++)
     611             :     {
     612             :        int j;
     613           0 :        float mindist = 1e15f;
     614           0 :        for (j=0;j<NB_FRAMES;j++)
     615             :        {
     616             :           int k;
     617           0 :           float dist=0;
     618           0 :           for (k=0;k<NB_TBANDS;k++)
     619             :           {
     620             :              float tmp;
     621           0 :              tmp = tonal->logE[i][k] - tonal->logE[j][k];
     622           0 :              dist += tmp*tmp;
     623             :           }
     624           0 :           if (j!=i)
     625           0 :              mindist = MIN32(mindist, dist);
     626             :        }
     627           0 :        spec_variability += mindist;
     628             :     }
     629           0 :     spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
     630           0 :     bandwidth_mask = 0;
     631           0 :     bandwidth = 0;
     632           0 :     maxE = 0;
     633           0 :     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
     634           0 :     noise_floor *= noise_floor;
     635           0 :     for (b=0;b<NB_TBANDS;b++)
     636             :     {
     637           0 :        float E=0;
     638             :        int band_start, band_end;
     639             :        /* Keep a margin of 300 Hz for aliasing */
     640           0 :        band_start = tbands[b];
     641           0 :        band_end = tbands[b+1];
     642           0 :        for (i=band_start;i<band_end;i++)
     643             :        {
     644           0 :           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
     645           0 :                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
     646           0 :           E += binE;
     647             :        }
     648           0 :        E = SCALE_ENER(E);
     649           0 :        maxE = MAX32(maxE, E);
     650           0 :        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
     651           0 :        E = MAX32(E, tonal->meanE[b]);
     652             :        /* Use a simple follower with 13 dB/Bark slope for spreading function */
     653           0 :        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
     654             :        /* Consider the band "active" only if all these conditions are met:
     655             :           1) less than 10 dB below the simple follower
     656             :           2) less than 90 dB below the peak band (maximal masking possible considering
     657             :              both the ATH and the loudness-dependent slope of the spreading function)
     658             :           3) above the PCM quantization noise floor
     659             :           We use b+1 because the first CELT band isn't included in tbands[]
     660             :        */
     661           0 :        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
     662           0 :           bandwidth = b+1;
     663             :     }
     664             :     /* Special case for the last two bands, for which we don't have spectrum but only
     665             :        the energy above 12 kHz. */
     666           0 :     if (tonal->Fs == 48000) {
     667             :        float ratio;
     668           0 :        float E = hp_ener*(1.f/(240*240));
     669           0 :        ratio = tonal->prev_bandwidth==20 ? 0.03f : 0.07f;
     670             : #ifdef FIXED_POINT
     671             :        /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
     672             :        E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
     673             : #endif
     674           0 :        maxE = MAX32(maxE, E);
     675           0 :        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
     676           0 :        E = MAX32(E, tonal->meanE[b]);
     677             :        /* Use a simple follower with 13 dB/Bark slope for spreading function */
     678           0 :        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
     679           0 :        if (E>ratio*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160)
     680           0 :           bandwidth = 20;
     681             :        /* This detector is unreliable, so if the bandwidth is close to SWB, assume it's FB. */
     682           0 :        if (bandwidth >= 17)
     683           0 :           bandwidth = 20;
     684             :     }
     685           0 :     if (tonal->count<=2)
     686           0 :        bandwidth = 20;
     687           0 :     frame_loudness = 20*(float)log10(frame_loudness);
     688           0 :     tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
     689           0 :     tonal->lowECount *= (1-alphaE);
     690           0 :     if (frame_loudness < tonal->Etracker-30)
     691           0 :        tonal->lowECount += alphaE;
     692             : 
     693           0 :     for (i=0;i<8;i++)
     694             :     {
     695           0 :        float sum=0;
     696           0 :        for (b=0;b<16;b++)
     697           0 :           sum += dct_table[i*16+b]*logE[b];
     698           0 :        BFCC[i] = sum;
     699             :     }
     700           0 :     for (i=0;i<8;i++)
     701             :     {
     702           0 :        float sum=0;
     703           0 :        for (b=0;b<16;b++)
     704           0 :           sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
     705           0 :        midE[i] = sum;
     706             :     }
     707             : 
     708           0 :     frame_stationarity /= NB_TBANDS;
     709           0 :     relativeE /= NB_TBANDS;
     710           0 :     if (tonal->count<10)
     711           0 :        relativeE = .5f;
     712           0 :     frame_noisiness /= NB_TBANDS;
     713             : #if 1
     714           0 :     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
     715             : #else
     716             :     info->activity = .5*(1+frame_noisiness-frame_stationarity);
     717             : #endif
     718           0 :     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
     719           0 :     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
     720           0 :     tonal->prev_tonality = frame_tonality;
     721             : 
     722           0 :     slope /= 8*8;
     723           0 :     info->tonality_slope = slope;
     724             : 
     725           0 :     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
     726           0 :     tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
     727           0 :     info->tonality = frame_tonality;
     728             : 
     729           0 :     for (i=0;i<4;i++)
     730           0 :        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
     731             : 
     732           0 :     for (i=0;i<4;i++)
     733           0 :        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
     734             : 
     735           0 :     for (i=0;i<4;i++)
     736           0 :         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
     737           0 :     for (i=0;i<3;i++)
     738           0 :         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
     739             : 
     740           0 :     if (tonal->count > 5)
     741             :     {
     742           0 :        for (i=0;i<9;i++)
     743           0 :           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
     744             :     }
     745           0 :     for (i=0;i<4;i++)
     746           0 :        features[i] = BFCC[i]-midE[i];
     747             : 
     748           0 :     for (i=0;i<8;i++)
     749             :     {
     750           0 :        tonal->mem[i+24] = tonal->mem[i+16];
     751           0 :        tonal->mem[i+16] = tonal->mem[i+8];
     752           0 :        tonal->mem[i+8] = tonal->mem[i];
     753           0 :        tonal->mem[i] = BFCC[i];
     754             :     }
     755           0 :     for (i=0;i<9;i++)
     756           0 :        features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
     757           0 :     features[18] = spec_variability - 0.78f;
     758           0 :     features[20] = info->tonality - 0.154723f;
     759           0 :     features[21] = info->activity - 0.724643f;
     760           0 :     features[22] = frame_stationarity - 0.743717f;
     761           0 :     features[23] = info->tonality_slope + 0.069216f;
     762           0 :     features[24] = tonal->lowECount - 0.067930f;
     763             : 
     764           0 :     mlp_process(&net, features, frame_probs);
     765           0 :     frame_probs[0] = .5f*(frame_probs[0]+1);
     766             :     /* Curve fitting between the MLP probability and the actual probability */
     767             :     /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);*/
     768             :     /* Probability of active audio (as opposed to silence) */
     769           0 :     frame_probs[1] = .5f*frame_probs[1]+.5f;
     770           0 :     frame_probs[1] *= frame_probs[1];
     771             : 
     772             :     /* Probability of speech or music vs noise */
     773           0 :     info->activity_probability = frame_probs[1];
     774             : 
     775             :     /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/
     776             :     {
     777             :        /* Probability of state transition */
     778             :        float tau;
     779             :        /* Represents independence of the MLP probabilities, where
     780             :           beta=1 means fully independent. */
     781             :        float beta;
     782             :        /* Denormalized probability of speech (p0) and music (p1) after update */
     783             :        float p0, p1;
     784             :        /* Probabilities for "all speech" and "all music" */
     785             :        float s0, m0;
     786             :        /* Probability sum for renormalisation */
     787             :        float psum;
     788             :        /* Instantaneous probability of speech and music, with beta pre-applied. */
     789             :        float speech0;
     790             :        float music0;
     791             :        float p, q;
     792             : 
     793             :        /* More silence transitions for speech than for music. */
     794           0 :        tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob);
     795           0 :        p = MAX16(.05f,MIN16(.95f,frame_probs[1]));
     796           0 :        q = MAX16(.05f,MIN16(.95f,tonal->vad_prob));
     797           0 :        beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
     798             :        /* p0 and p1 are the probabilities of speech and music at this frame
     799             :           using only information from previous frame and applying the
     800             :           state transition model */
     801           0 :        p0 = (1-tonal->vad_prob)*(1-tau) +    tonal->vad_prob *tau;
     802           0 :        p1 =    tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau;
     803             :        /* We apply the current probability with exponent beta to work around
     804             :           the fact that the probability estimates aren't independent. */
     805           0 :        p0 *= (float)pow(1-frame_probs[1], beta);
     806           0 :        p1 *= (float)pow(frame_probs[1], beta);
     807             :        /* Normalise the probabilities to get the Marokv probability of music. */
     808           0 :        tonal->vad_prob = p1/(p0+p1);
     809           0 :        info->vad_prob = tonal->vad_prob;
     810             :        /* Consider that silence has a 50-50 probability of being speech or music. */
     811           0 :        frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f;
     812             : 
     813             :        /* One transition every 3 minutes of active audio */
     814           0 :        tau = .0001f;
     815             :        /* Adapt beta based on how "unexpected" the new prob is */
     816           0 :        p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
     817           0 :        q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
     818           0 :        beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
     819             :        /* p0 and p1 are the probabilities of speech and music at this frame
     820             :           using only information from previous frame and applying the
     821             :           state transition model */
     822           0 :        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
     823           0 :        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
     824             :        /* We apply the current probability with exponent beta to work around
     825             :           the fact that the probability estimates aren't independent. */
     826           0 :        p0 *= (float)pow(1-frame_probs[0], beta);
     827           0 :        p1 *= (float)pow(frame_probs[0], beta);
     828             :        /* Normalise the probabilities to get the Marokv probability of music. */
     829           0 :        tonal->music_prob = p1/(p0+p1);
     830           0 :        info->music_prob = tonal->music_prob;
     831             : 
     832             :        /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/
     833             :        /* This chunk of code deals with delayed decision. */
     834           0 :        psum=1e-20f;
     835             :        /* Instantaneous probability of speech and music, with beta pre-applied. */
     836           0 :        speech0 = (float)pow(1-frame_probs[0], beta);
     837           0 :        music0  = (float)pow(frame_probs[0], beta);
     838           0 :        if (tonal->count==1)
     839             :        {
     840           0 :           if (tonal->application == OPUS_APPLICATION_VOIP)
     841           0 :              tonal->pmusic[0] = .1f;
     842             :           else
     843           0 :              tonal->pmusic[0] = .625f;
     844           0 :           tonal->pspeech[0] = 1-tonal->pmusic[0];
     845             :        }
     846             :        /* Updated probability of having only speech (s0) or only music (m0),
     847             :           before considering the new observation. */
     848           0 :        s0 = tonal->pspeech[0] + tonal->pspeech[1];
     849           0 :        m0 = tonal->pmusic [0] + tonal->pmusic [1];
     850             :        /* Updates s0 and m0 with instantaneous probability. */
     851           0 :        tonal->pspeech[0] = s0*(1-tau)*speech0;
     852           0 :        tonal->pmusic [0] = m0*(1-tau)*music0;
     853             :        /* Propagate the transition probabilities */
     854           0 :        for (i=1;i<DETECT_SIZE-1;i++)
     855             :        {
     856           0 :           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
     857           0 :           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
     858             :        }
     859             :        /* Probability that the latest frame is speech, when all the previous ones were music. */
     860           0 :        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
     861             :        /* Probability that the latest frame is music, when all the previous ones were speech. */
     862           0 :        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
     863             : 
     864             :        /* Renormalise probabilities to 1 */
     865           0 :        for (i=0;i<DETECT_SIZE;i++)
     866           0 :           psum += tonal->pspeech[i] + tonal->pmusic[i];
     867           0 :        psum = 1.f/psum;
     868           0 :        for (i=0;i<DETECT_SIZE;i++)
     869             :        {
     870           0 :           tonal->pspeech[i] *= psum;
     871           0 :           tonal->pmusic [i] *= psum;
     872             :        }
     873           0 :        psum = tonal->pmusic[0];
     874           0 :        for (i=1;i<DETECT_SIZE;i++)
     875           0 :           psum += tonal->pspeech[i];
     876             : 
     877             :        /* Estimate our confidence in the speech/music decisions */
     878           0 :        if (frame_probs[1]>.75)
     879             :        {
     880           0 :           if (tonal->music_prob>.9)
     881             :           {
     882             :              float adapt;
     883           0 :              adapt = 1.f/(++tonal->music_confidence_count);
     884           0 :              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
     885           0 :              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
     886             :           }
     887           0 :           if (tonal->music_prob<.1)
     888             :           {
     889             :              float adapt;
     890           0 :              adapt = 1.f/(++tonal->speech_confidence_count);
     891           0 :              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
     892           0 :              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
     893             :           }
     894             :        }
     895             :     }
     896           0 :     tonal->last_music = tonal->music_prob>.5f;
     897             : #ifdef MLP_TRAINING
     898             :     for (i=0;i<25;i++)
     899             :        printf("%f ", features[i]);
     900             :     printf("\n");
     901             : #endif
     902             : 
     903           0 :     info->bandwidth = bandwidth;
     904           0 :     tonal->prev_bandwidth = bandwidth;
     905             :     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
     906           0 :     info->noisiness = frame_noisiness;
     907           0 :     info->valid = 1;
     908             :     RESTORE_STACK;
     909             : }
     910             : 
     911           0 : void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
     912             :                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
     913             :                  int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
     914             : {
     915             :    int offset;
     916             :    int pcm_len;
     917             : 
     918           0 :    analysis_frame_size -= analysis_frame_size&1;
     919           0 :    if (analysis_pcm != NULL)
     920             :    {
     921             :       /* Avoid overflow/wrap-around of the analysis buffer */
     922           0 :       analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
     923             : 
     924           0 :       pcm_len = analysis_frame_size - analysis->analysis_offset;
     925           0 :       offset = analysis->analysis_offset;
     926           0 :       while (pcm_len>0) {
     927           0 :          tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
     928           0 :          offset += Fs/50;
     929           0 :          pcm_len -= Fs/50;
     930             :       }
     931           0 :       analysis->analysis_offset = analysis_frame_size;
     932             : 
     933           0 :       analysis->analysis_offset -= frame_size;
     934             :    }
     935             : 
     936           0 :    analysis_info->valid = 0;
     937           0 :    tonality_get_info(analysis, analysis_info, frame_size);
     938           0 : }
     939             : 
     940             : #endif /* DISABLE_FLOAT_API */

Generated by: LCOV version 1.13