From: David Doukhan Date: Fri, 24 Jan 2014 17:35:49 +0000 (+0100) Subject: limsi frame by frame speech activity detection system X-Git-Url: https://git.parisson.com/?a=commitdiff_plain;h=df04e2130725803faf9623ade8c4d97ac60c484e;p=timeside-diadems.git limsi frame by frame speech activity detection system --- diff --git a/timeside/analyzer/__init__.py b/timeside/analyzer/__init__.py index 4c293f1..a8d8b22 100644 --- a/timeside/analyzer/__init__.py +++ b/timeside/analyzer/__init__.py @@ -14,3 +14,4 @@ from vamp_plugin import VampSimpleHost from irit_speech_entropy import IRITSpeechEntropy from irit_speech_4hz import IRITSpeech4Hz from odf import OnsetDetectionFunction +from limsi_sad import LimsiSad diff --git a/timeside/analyzer/limsi_sad.py b/timeside/analyzer/limsi_sad.py new file mode 100644 index 0000000..b503317 --- /dev/null +++ b/timeside/analyzer/limsi_sad.py @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 David Doukhan + +# This file is part of TimeSide. + +# TimeSide is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. + +# TimeSide is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with TimeSide. If not, see http://www.gnu.org/licenses/.
# Author: David Doukhan

from timeside.core import implements, interfacedoc
from timeside.analyzer.core import Analyzer
from timeside.api import IAnalyzer
import timeside
from yaafe import Yaafe
import yaafelib
import numpy as N
import pickle
import os.path


class GMM:
    """
    Gaussian mixture model with one variance per dimension and component.

    Attributes:
    ----------
    weights : mixture weights, one per component
    means : component means, 2-D (components x dimensions)
    vars : component variances, same layout as `means`
    """
    # NOTE(review): the class statement is kept exactly as originally
    # declared (same name, no explicit base class). The model files loaded
    # by LimsiSad presumably contain pickled instances of this class, so
    # confirm that unpickling still works before changing either.

    def __init__(self, weights, means, vars):
        # `vars` shadows the builtin of the same name; the parameter name
        # is part of the constructor interface and is therefore kept.
        self.weights = weights
        self.means = means
        self.vars = vars

    def llh(self, x):
        """
        Return the log likelihood of each row of `x` under the mixture.

        Parameters:
        ----------
        x : 2-D array (samples x dimensions)

        Returns one value per row of `x`: the log of the weighted sum of
        the component densities.
        """
        # unpacking also enforces that x is 2-D
        _, n_dim = x.shape
        # per-component log(weight * density); the squared distance
        # (x - mean)**2 / var is expanded so it can be computed with
        # matrix products for all samples and components at once
        llh = -0.5 * (n_dim * N.log(2 * N.pi) + N.sum(N.log(self.vars), 1)
                      + N.sum((self.means ** 2) / self.vars, 1)
                      - 2 * N.dot(x, (self.means / self.vars).T)
                      + N.dot(x ** 2, (1.0 / self.vars).T)) + N.log(self.weights)
        # log-sum-exp over components: subtracting the row maximum before
        # exponentiating avoids underflow
        m = N.amax(llh, 1)
        dif = llh - N.atleast_2d(m).T
        return m + N.log(N.sum(N.exp(dif), 1))


class LimsiSad(Analyzer):
    """
    Limsi Speech Activity Detection Systems
    LimsiSad performs frame level speech activity detection based on GMM models
    For each frame, it computes the log likelihood difference between a speech model and a non speech model.
    The higher the estimate, the more likely the frame corresponds to speech.
    The initialization of the analyzer requires choosing a model between 'etape' and 'maya'
    'etape' models were obtained on data collected by LIMSI in the framework of ETAPE ANR project
    'maya' models were obtained on data collected by EREA - Centre Enseignement et Recherche en Ethnologie Amerindienne
    """
    implements(IAnalyzer)

    def __init__(self, sad_model):
        """
        Parameters:
        ----------
        sad_model : string, either 'etape' or 'maya'
          allows the selection of a SAD model:
          'etape' is more suited to radionews material
          'maya' is more suited to speech obtained in noisy environments

        Raises ValueError when sad_model is not one of the supported values.
        """
        super(LimsiSad, self).__init__()

        # feature extraction definition
        spec = yaafelib.FeaturePlan(sample_rate=16000)
        spec.addFeature('mfcc: MFCC CepsIgnoreFirstCoeff=0 blockSize=1024 stepSize=256')
        spec.addFeature('mfccd1: MFCC CepsIgnoreFirstCoeff=0 blockSize=1024 stepSize=256 > Derivate DOrder=1')
        spec.addFeature('mfccd2: MFCC CepsIgnoreFirstCoeff=0 blockSize=1024 stepSize=256 > Derivate DOrder=2')
        spec.addFeature('zcr: ZCR blockSize=1024 stepSize=256')
        parent_analyzer = Yaafe(spec)
        self.parents.append(parent_analyzer)

        # informative parameters
        # these are not really taken into account by the system
        # these are bypassed by yaafe feature plan
        self.input_blocksize = 1024
        self.input_stepsize = 256

        # load gmm model
        if sad_model not in ['etape', 'maya']:
            raise ValueError("argument sad_model %s not supported. Supported values are 'etape' or 'maya'" % sad_model)
        picfname = os.path.join(timeside.__path__[0], 'trained_models', 'limsi_sad_%s.pkl' % sad_model)
        # The model file is looked up inside the timeside package directory.
        # pickle executes code on load, so this path must never be pointed
        # at untrusted files. The context manager closes the file handle
        # even if unpickling fails.
        with open(picfname, 'rb') as model_file:
            self.gmms = pickle.load(model_file)

    @staticmethod
    @interfacedoc
    def id():
        return "limsi_sad"

    @staticmethod
    @interfacedoc
    def name():
        return "Limsi speech activity detection system"

    @staticmethod
    @interfacedoc
    def unit():
        # return the unit of the data dB, St, ...
        return "Log Probability difference"

    def process(self, frames, eod=False):
        """
        Check the input sample rate and return `frames` and `eod` unchanged.

        Raises Exception when the input sample rate is not 16000.
        """
        if self.input_samplerate != 16000:
            raise Exception('%s requires 16000 input sample rate: %d provided' % (self.__class__.__name__, self.input_samplerate))
        return frames, eod

    def post_process(self):
        """
        Compute the framewise result from the yaafe features and add it to
        the pipe results.
        """
        results = self.process_pipe.results
        mfcc = results['yaafe.mfcc']['data_object']['value']
        mfccd1 = results['yaafe.mfccd1']['data_object']['value']
        mfccd2 = results['yaafe.mfccd2']['data_object']['value']
        zcr = results['yaafe.zcr']['data_object']['value']

        # one row per frame, all features side by side
        # (assumes every feature value is 2-D, frames x coefficients -- TODO confirm)
        features = N.concatenate((mfcc, mfccd1, mfccd2, zcr), axis=1)

        # log likelihood difference between the two loaded models
        # (presumably gmms[0] is the speech model and gmms[1] the
        # non speech model -- confirm against the trained model files),
        # rescaled as 0.5 + 0.5 * difference
        res = 0.5 + 0.5 * (self.gmms[0].llh(features) - self.gmms[1].llh(features))

        sad_result = self.new_result(data_mode='value', time_mode='framewise')
        sad_result.id_metadata.id += '.' + 'sad_lhh_diff'
        sad_result.id_metadata.name += ' ' + 'Speech Activity Detection Log Likelihood Difference'
        sad_result.data_object.value = res
        self.process_pipe.results.add(sad_result)