From 2eab6b1baa216642fe02db07800a5785b6668f7d Mon Sep 17 00:00:00 2001 From: Thomas Fillon Date: Fri, 3 Oct 2014 13:25:30 +0200 Subject: [PATCH] merge limssi_sad from diadems into dev --- timeside/analyzer/limsi_sad.py | 157 ++++++++++++++++++++++++++++----- 1 file changed, 136 insertions(+), 21 deletions(-) diff --git a/timeside/analyzer/limsi_sad.py b/timeside/analyzer/limsi_sad.py index 2d0af7b..7396965 100644 --- a/timeside/analyzer/limsi_sad.py +++ b/timeside/analyzer/limsi_sad.py @@ -24,16 +24,17 @@ from timeside.analyzer.core import Analyzer from timeside.api import IAnalyzer import timeside -from ..tools.parameters import Enum, HasTraits +from ..tools.parameters import Enum, HasTraits, Float, Tuple -import yaafelib import numpy as np import pickle import os.path class GMM: - + """ + Gaussian Mixture Model + """ def __init__(self, weights, means, vars): self.weights = weights self.means = means @@ -46,36 +47,100 @@ class GMM: - 2 * np.dot(x, (self.means / self.vars).T) + np.dot(x ** 2, (1.0 / self.vars).T)) + np.log(self.weights) - m = np.amax(llh, 1) + m = np.amax(llh,1) dif = llh - np.atleast_2d(m).T - return m + np.log(np.sum(np.exp(dif), 1)) + return m + np.log(np.sum(np.exp(dif),1)) -class LimsiSad(Analyzer): +def slidewinmap(lin, winsize, func): + """ + map a function to a list of elements using a sliding window + the window is centered on the element to process + missing values required by the windows corresponding to the beginning, or end + of the signal are replaced with the first, or last, element of the list + + Parameters: + ---------- + lin: input (list) + winsize: size of the sliding windows in samples (int) + func: function to be mapped on sliding windows + """ + tmpin = ([lin[0]] * (winsize/2)) + list(lin) + ([lin[-1]] * (winsize -1 - winsize/2)) + lout = [] + for i in xrange(len(lin)): + lout.append(func(tmpin[i:(i+winsize)])) + assert(len(lin) == len(lout)) + return lout +def dilatation(lin, winsize): + """ + morphological dilation + """ + 
return slidewinmap(lin, winsize, max) + +def erosion(lin, winsize): + """ + morphological erosion + """ + return slidewinmap(lin, winsize, min) + + +class LimsiSad(Analyzer): """ Limsi Speech Activity Detection Systems - LimsiSad performs frame level speech activity detection based on GMM models + LimsiSad performs frame level speech activity detection based on trained GMM models For each frame, it computes the log likelihood difference between a speech model and a non speech model. The highest is the estimate, the largest is the probability that the frame corresponds to speech. - The initialization of the analyzer requires to chose a model between 'etape' and 'maya' - 'etape' models were trained on data distributed in the framework of the ETAPE campaign (http://www.afcp-parole.org/etape.html) - 'maya' models were obtained on data collected by EREA – Centre Enseignement et Recherche en Ethnologie Amerindienne + Dilatation and erosion procedures are used in a later stage to obtain speech and non speech segments + + The analyser outputs 3 result structures: + * sad_lhh_diff: the raw frame level speech/non speech log likelihood difference + * sad_de_lhh_diff: frame level speech/non speech log likelihood difference + altered with erosion and dilatation procedures + * sad_segments: speech/non speech segments """ implements(IAnalyzer) # Define Parameters class _Param(HasTraits): - sad_model = Enum('etape', 'maya') + sad_model = Enum('etape', 'maya') + dews = Float + speech_threshold = Float + dllh_bounds = Tuple(Float, Float) - def __init__(self, sad_model='etape'): + def __init__(self, sad_model='etape', dews=0.2, speech_threshold=1., + dllh_bounds=(-10., 10.)): """ Parameters: ---------- - sad_model : string bellowing to 'etape' 'maya' - alllows the selection of a SAD model: - 'etape' is more suited to radionews material - 'maya' is more suited to speech obtained in noisy environments + + sad_model : string belonging to ['etape', 'maya'] + Allows the selection of 
trained speech activity detection models. + * 'etape' models were trained on data distributed in the framework of the + ETAPE campaign (http://www.afcp-parole.org/etape.html) + These models are suited for radionews material (0.974 AUC on Etape data) + * 'maya' models were obtained on data collected by EREA – Centre + Enseignement et Recherche en Ethnologie Amerindienne + These models are suited to speech obtained in noisy environments + (0.915 AUC on Maya data) + + + dews: dilatation and erosion window size (seconds) + This value corresponds to the size in seconds of the sliding window + used to perform a dilation followed by an erosion procedure + these procedures consist in outputting the max (respectively the min) of the + speech detection estimate. The order of these procedures is aimed at removing + non-speech frames corresponding to fricatives or short pauses + The size of the windows corresponds to the minimal size of the resulting + speech/non speech segments + + speech_threshold: threshold used for speech/non speech decision + based on the log likelihood difference + + dllh_bounds: raw log likelihood difference estimates will be bound + according to this (min_llh_difference, max_llh_difference) tuple + Useful for plotting log likelihood differences + if set to None, no bounding will be done """ super(LimsiSad, self).__init__() @@ -103,6 +168,10 @@ class LimsiSad(Analyzer): timeside.__path__[0], 'analyzer', 'trained_models', 'limsi_sad_%s.pkl' % sad_model) self.gmms = pickle.load(open(picfname, 'rb')) + self.dews = dews + self.speech_threshold = speech_threshold + self.dllh_bounds = dllh_bounds + @staticmethod @interfacedoc def id(): @@ -127,20 +196,66 @@ class LimsiSad(Analyzer): return frames, eod def post_process(self): + # extract signal features yaafe_result = self.process_pipe.results[self.parents['yaafe'].uuid()] mfcc = yaafe_result['yaafe.mfcc']['data_object']['value'] mfccd1 = yaafe_result['yaafe.mfccd1']['data_object']['value'] mfccd2 = 
yaafe_result['yaafe.mfccd2']['data_object']['value'] zcr = yaafe_result['yaafe.zcr']['data_object']['value'] - features = np.concatenate((mfcc, mfccd1, mfccd2, zcr), axis=1) - res = 0.5 + 0.5 * \ - (self.gmms[0].llh(features) - self.gmms[1].llh(features)) + # compute log likelihood difference + res = 0.5 + 0.5 * (self.gmms[0].llh(features) - self.gmms[1].llh(features)) + # bound log likelihood difference + if self.dllh_bounds is not None: + mindiff, maxdiff = self.dllh_bounds + res = np.minimum(np.maximum(res, mindiff), maxdiff) + + # performs dilation, erosion, erosion, dilatation + ws = int(self.dews * float(self.input_samplerate ) / self.input_stepsize) + deed_llh = dilatation(erosion(erosion(dilatation(res, ws), ws), ws), ws) + + # infer speech and non speech segments from dilated + # and eroded likelihood difference estimate + last = None + labels = [] + times = [] + durations = [] + for i, val in enumerate([1 if e > self.speech_threshold else 0 for e in deed_llh]): + if val != last: + labels.append(val) + durations.append(1) + times.append(i) + else: + durations[-1] += 1 + last = val + times = [(float(e) * self.input_stepsize) / self.input_samplerate for e in times] + durations = [(float(e) * self.input_stepsize) / self.input_samplerate for e in durations] + + + # outputs the raw frame level speech/non speech log likelihood difference sad_result = self.new_result(data_mode='value', time_mode='framewise') sad_result.id_metadata.id += '.' + 'sad_lhh_diff' - sad_result.id_metadata.name += ' ' + \ - 'Speech Activity Detection Log Likelihood Difference' + sad_result.id_metadata.name += ' ' + 'Speech Activity Detection Log Likelihood Difference' sad_result.data_object.value = res self.add_result(sad_result) + + # outputs frame level speech/non speech log likelihood difference + # altered with erosion and dilatation procedures + sad_de_result = self.new_result(data_mode='value', time_mode='framewise') + sad_de_result.id_metadata.id += '.' 
+ 'sad_de_lhh_diff' + sad_de_result.id_metadata.name += ' ' + 'Speech Activity Detection Log Likelihood Difference | dilat | erode' + sad_de_result.data_object.value = deed_llh + self.add_result(sad_de_result) + + # outputs speech/non speech segments + sad_seg_result = self.new_result(data_mode='label', time_mode='segment') + sad_seg_result.id_metadata.id += '.' + 'sad_segments' + sad_seg_result.id_metadata.name += ' ' + 'Speech Activity Detection Segments' + sad_seg_result.data_object.label = labels + sad_seg_result.data_object.time = times + sad_seg_result.data_object.duration = durations + sad_seg_result.data_object.label_metadata.label = {0: 'Not Speech', 1: 'Speech'} + + self.add_result(sad_seg_result) -- 2.39.5