]> git.parisson.com Git - timeside-diadems.git/commitdiff
limsi frame by frame speech activity detection system
authorDavid Doukhan <david.doukhan@gmail.com>
Fri, 24 Jan 2014 17:35:49 +0000 (18:35 +0100)
committerDavid Doukhan <david.doukhan@gmail.com>
Fri, 24 Jan 2014 17:35:49 +0000 (18:35 +0100)
timeside/analyzer/__init__.py
timeside/analyzer/limsi_sad.py [new file with mode: 0644]

index 4c293f17ede44fd6219c106a58e595ab64fb4d81..a8d8b22f4266de463cb8f6b19f2143d98239a65e 100644 (file)
@@ -14,3 +14,4 @@ from vamp_plugin import VampSimpleHost
 from irit_speech_entropy import IRITSpeechEntropy
 from irit_speech_4hz import IRITSpeech4Hz
 from odf import OnsetDetectionFunction
+from limsi_sad import LimsiSad
diff --git a/timeside/analyzer/limsi_sad.py b/timeside/analyzer/limsi_sad.py
new file mode 100644 (file)
index 0000000..b503317
--- /dev/null
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2013 David Doukhan <doukhan@limsi.fr>
+
+# This file is part of TimeSide.
+
+# TimeSide is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+
+# TimeSide is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with TimeSide.  If not, see <http://www.gnu.org/licenses/>.
+
+# Author: David Doukhan <doukhan@limsi.fr>
+
+from timeside.core import implements, interfacedoc
+from timeside.analyzer.core import Analyzer
+from timeside.api import IAnalyzer
+import timeside
+from yaafe import Yaafe
+import yaafelib
+import numpy as N
+import pickle
+import os.path
+
class GMM:
    """Minimal diagonal-covariance Gaussian Mixture Model scorer.

    Only stores pre-trained parameters and evaluates log-likelihoods;
    no fitting is done here (models are loaded from pickled files).
    """

    def __init__(self, weights, means, vars):
        """
        Parameters:
        ----------
        weights : array, shape (n_components,)
            mixture weights of each Gaussian component
        means : array, shape (n_components, n_dim)
            component mean vectors
        vars : array, shape (n_components, n_dim)
            per-dimension (diagonal) variances of each component
        """
        self.weights = weights
        self.means = means
        self.vars = vars

    def llh(self, x):
        """Return the log-likelihood of each sample under the mixture.

        Parameters:
        ----------
        x : array, shape (n_samples, n_dim)
            feature vectors to score

        Returns:
        -------
        array, shape (n_samples,)
            log p(x_i) for each sample, computed with the log-sum-exp
            trick for numerical stability
        """
        n_samples, n_dim = x.shape
        # Per-component diagonal-Gaussian log-density, expanded so the
        # sample-dependent terms become matrix products:
        #   log N(x; mu, var) = -0.5 * (D*log(2*pi) + sum(log var)
        #                       + sum(mu^2/var) - 2*x.(mu/var)^T + x^2.(1/var)^T)
        # FIX: log(weights) must be part of this expression; in the original
        # code it was a separate (dead) statement, so mixture weights were
        # silently ignored.
        llh = (-0.5 * (n_dim * N.log(2 * N.pi) + N.sum(N.log(self.vars), 1)
                       + N.sum((self.means ** 2) / self.vars, 1)
                       - 2 * N.dot(x, (self.means / self.vars).T)
                       + N.dot(x ** 2, (1.0 / self.vars).T))
               + N.log(self.weights))
        # Log-sum-exp over components: subtract the row max before
        # exponentiating to avoid overflow/underflow.
        m = N.amax(llh, 1)
        dif = llh - N.atleast_2d(m).T
        return m + N.log(N.sum(N.exp(dif), 1))
+
+
class LimsiSad(Analyzer):
    """
    LIMSI frame-by-frame Speech Activity Detection (SAD) system.

    For each frame, LimsiSad computes the log-likelihood difference between
    a GMM speech model and a GMM non-speech model: the higher the estimate,
    the larger the probability that the frame corresponds to speech.

    The analyzer must be initialized with one of two trained model sets:
      * 'etape': models obtained on data collected by LIMSI in the framework
        of the ETAPE ANR project
      * 'maya': models obtained on data collected by EREA - Centre
        Enseignement et Recherche en Ethnologie Amerindienne
    """
    implements(IAnalyzer)

    def __init__(self, sad_model):
        """
        Parameters:
        ----------
        sad_model : string, either 'etape' or 'maya'
            selects the trained SAD GMM models:
            'etape' is more suited to radio news material,
            'maya' is more suited to speech obtained in noisy environments

        Raises:
        ------
        ValueError if sad_model is not one of the supported values
        """
        super(LimsiSad, self).__init__()

        # Feature extraction definition: MFCC (keeping the 0th cepstral
        # coefficient) plus its first and second derivatives and the zero
        # crossing rate, all on 1024-sample blocks with a 256-sample hop
        # at a 16 kHz sample rate.
        spec = yaafelib.FeaturePlan(sample_rate=16000)
        spec.addFeature('mfcc: MFCC CepsIgnoreFirstCoeff=0 blockSize=1024 stepSize=256')
        spec.addFeature('mfccd1: MFCC CepsIgnoreFirstCoeff=0 blockSize=1024 stepSize=256 > Derivate DOrder=1')
        spec.addFeature('mfccd2: MFCC CepsIgnoreFirstCoeff=0 blockSize=1024 stepSize=256 > Derivate DOrder=2')
        spec.addFeature('zcr: ZCR blockSize=1024 stepSize=256')
        parent_analyzer = Yaafe(spec)
        self.parents.append(parent_analyzer)

        # Informative parameters only: these are not really taken into
        # account by the system, since the actual block/step sizes are
        # driven by the yaafe feature plan above.
        self.input_blocksize = 1024
        self.input_stepsize = 256

        # Load the pickled speech / non-speech GMM pair.
        if sad_model not in ('etape', 'maya'):
            raise ValueError("argument sad_model %s not supported. Supported values are 'etape' or 'maya'" % sad_model)
        picfname = os.path.join(timeside.__path__[0], 'trained_models',
                                'limsi_sad_%s.pkl' % sad_model)
        # FIX: use a context manager so the model file is closed
        # deterministically instead of being leaked.
        with open(picfname, 'rb') as picfile:
            self.gmms = pickle.load(picfile)

    @staticmethod
    @interfacedoc
    def id():
        return "limsi_sad"

    @staticmethod
    @interfacedoc
    def name():
        return "Limsi speech activity detection system"

    @staticmethod
    @interfacedoc
    def unit():
        # Unit of the produced data (the result is a log-probability
        # difference, not dB or another physical unit).
        return "Log Probability difference"

    def process(self, frames, eod=False):
        """Pass frames through unchanged; feature extraction is done by the
        Yaafe parent analyzer. Only validates the input sample rate."""
        if self.input_samplerate != 16000:
            raise Exception('%s requires 16000 input sample rate: %d provided' % (self.__class__.__name__, self.input_samplerate))
        return frames, eod

    def post_process(self):
        """Score each frame and store the per-frame speech/non-speech
        log-likelihood difference as a framewise analyzer result."""
        # Gather the features computed by the Yaafe parent analyzer.
        mfcc = self.process_pipe.results['yaafe.mfcc']['data_object']['value']
        mfccd1 = self.process_pipe.results['yaafe.mfccd1']['data_object']['value']
        mfccd2 = self.process_pipe.results['yaafe.mfccd2']['data_object']['value']
        zcr = self.process_pipe.results['yaafe.zcr']['data_object']['value']

        features = N.concatenate((mfcc, mfccd1, mfccd2, zcr), axis=1)

        # Half the log-likelihood difference, shifted by 0.5.
        # NOTE(review): this assumes gmms[0] is the speech model and
        # gmms[1] the non-speech model — confirm against the pickled files.
        res = 0.5 + 0.5 * (self.gmms[0].llh(features) - self.gmms[1].llh(features))

        sad_result = self.new_result(data_mode='value', time_mode='framewise')
        sad_result.id_metadata.id += '.' + 'sad_lhh_diff'
        sad_result.id_metadata.name += ' ' + 'Speech Activity Detection Log Likelihood Difference'
        sad_result.data_object.value = res
        self.process_pipe.results.add(sad_result)
+