from timeside.core import implements, interfacedoc
from timeside.analyzer.core import Analyzer
from timeside.api import IAnalyzer
- # from timeside.analyzer import IRITSpeech4Hz
from yaafe import Yaafe
import yaafelib
- from timeside.analyzer import LimsiSad
+ from timeside.analyzer.limsi_sad import LimsiSad
import numpy as N
import sys
return frames, eod
def post_process(self):
-
+ # extract mfcc with yaafe and store them to be used with pyannote
- mfcc = self.process_pipe.results['yaafe.mfccchop']['data_object']['value']
- sw = YaafeFrame(self.input_blocksize, self.input_stepsize, self.input_samplerate)
- pyannotefeat = SlidingWindowFeature(mfcc, sw)
+ mfcc = self.process_pipe.results.get_result_by_id('yaafe.mfccchop')['data_object']['value']
+ sw = YaafeFrame(self.input_blocksize, self.input_stepsize, self.input_samplerate)
+ pyannotefeat = SlidingWindowFeature(mfcc, sw)
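+ # SlidingWindowFeature pairs the MFCC matrix with its frame timing (YaafeFrame),
+ # so frame indices can later be mapped back to time segments via
+ # pyannotefeat.sliding_window.rangeToSegment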
- # speech activity detection: usefull for debugging purpose only
- # print 'adding sad res to result'
- # sadres = self.new_result(data_mode='value', time_mode='framewise')
- # sadres.id_metadata.id += '.' + 'sadres'
- # sadres.id_metadata.name += ' ' + 'SAD RESULT'
- sadval = self.process_pipe.results.get_result_by_id(self.sad_analyzer.id() + '.sad_lhh_diff').data_object.value[:]
- # sadres.data_object.value = sadval
- # self.process_pipe.results.add(sadres)
-
# gaussian divergence window size
- #winframesize = int((self.input_samplerate / self.input_stepsize) * self.chop_window_size)
timestepsize = self.input_stepsize / float(self.input_samplerate)
gdiff_win_size_frame = int(self.gdiff_win_size_sec / timestepsize)
min_seg_size_frame = int(self.min_seg_size_sec / timestepsize)
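+ # e.g. a 1024-sample step at 32 kHz gives timestepsize = 0.032 s, so a 5 s
+ # divergence window corresponds to int(5 / 0.032) = 156 frames (illustrative values)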
- # print 'timestepsize %d, gdiffwinsize (sec: %f, frame: %d) , minsegsize (sec: %f, frame: %d)' % (timestepsize, self.gdiff_win_size_sec, gdiff_win_size_frame, self.min_seg_size_sec, min_seg_size_frame)
-
-
- # basic gauss div
- #bgd = (range(0, len(mfcc)), 'basicgdiv')
- # speech gauss div
- sgd = ([i for i, val in enumerate(sadval) if val > 0], 'speechgdiv')
- # speech above threshold
- #thres = N.percentile([val for val in sadval if val > 0], 25)
- #sat = ([i for i, val in enumerate(sadval) if val > thres], 'speechthreshold25p')
-
-# for frameids, name in [bgd, sgd, sat]:
- for frameids, name in [sgd]:
-
- gdiff = gauss_div(mfcc[frameids,:], gdiff_win_size_frame)
-
- # debug purpose only
- # res = self.new_result(data_mode='value', time_mode='event')
- # res.id_metadata.id += ('.%s' % name)
- # res.id_metadata.name += (' %s' % name)
- # res.data_object.value = N.array(gdiff)
- # res.data_object.time = N.array(frameids[gdiff_win_size_frame:(1-gdiff_win_size_frame)]) * timestepsize
- # self.process_pipe.results.add(res)
-
- seg = segment(gdiff, min_seg_size_frame)
- # seg_result = self.new_result(data_mode='value', time_mode='event')
- # seg_result.id_metadata.id += '.' + name + 'segchop'
- # seg_result.id_metadata.name += ' ' + name + 'seg chop'
- # seg_result.data_object.value = N.array(seg)
- # seg_result.data_object.time = N.array(frameids[gdiff_win_size_frame:(1-gdiff_win_size_frame)]) * timestepsize
- # self.process_pipe.results.add(seg_result)
-
- # build pyannote annotation
- # print 'building annotation'
- #b = time.time()
- chunks = Annotation()
- fbegin = None
- #fend = None
- lastframe = None
- ichunk = 0
- for segval, iframe in zip(seg, frameids):
- if segval == 1:
- if lastframe is not None:
- chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, iframe-fbegin)] = str(ichunk)
- ichunk += 1
- fbegin= iframe
- elif iframe -1 != lastframe:
- if lastframe is not None:
- chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
- fbegin= iframe
- lastframe = iframe
- if lastframe != fbegin:
- chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
- # print 'chunks', chunks
- #print 'anotation build in', (time.time() - b)
-
- bicClustering = BICClustering(covariance_type='full', penalty_coef=self.bic_penalty_coeff)
- hypothesis = bicClustering(chunks, feature=pyannotefeat)
-
- # gen result interval
- #print 'gen result interval'
- diar_res = self.new_result(data_mode='label', time_mode='segment')
- diar_res.id_metadata.id += '.' + 'speakers' # + name + 'diarisation'
- diar_res.id_metadata.name += ' ' + 'speaker identifiers' # name + 'diarisation'
-
- tmplabel = [int(h[2]) for h in hypothesis.itertracks(True)]
- tmptime = [h[0].start for h in hypothesis.itertracks()]
- tmpduration = [h[0].duration for h in hypothesis.itertracks()]
-
- label = []
- time = []
- duration = []
- lastlabel = None
-
- for l, t, d in zip(tmplabel, tmptime, tmpduration):
- if l != lastlabel:
- label.append(l)
- duration.append(d)
- time.append(t)
- else:
- duration[-1] = t + d - time[-1]
- lastlabel = l
-
-
- diar_res.data_object.label = label
- diar_res.data_object.time = time
- diar_res.data_object.duration = duration
- diar_res.data_object.label_metadata.label = dict()
- for lab in diar_res.data_object.label:
- diar_res.data_object.label_metadata.label[lab] = str(lab)
- # TODO FIXME
- # for h in hypothesis.itertracks(label=True):
- # diar_res.data_object.label.append(h[2])
- # diar_res.data_object.time.append(h[0].start)
- # diar_res.data_object.duration.apeend(h[0].duration)
- #sadres.data_object.value = sadval
- self.process_pipe.results.add(diar_res)
+
-
+ # speech activity detection
- sadval = self.process_pipe.results[self.sad_analyzer.id() + '.sad_lhh_diff'].data_object.value[:]
+ sadval = self.process_pipe.results.get_result_by_id(self.sad_analyzer.id() + '.sad_lhh_diff').data_object.value[:]
+ # indices of frames detected as speech
+ speech_threshold = 0.
+ frameids = [i for i, val in enumerate(sadval) if val > speech_threshold]
+
+ # compute gaussian divergence of speech frames only
+ gdiff = gauss_div(mfcc[frameids,:], gdiff_win_size_frame)
+
+ # initial segmentation based on gaussian divergence criterion
+ seg = segment(gdiff, min_seg_size_frame)
+
+ # Convert initial segmentation to pyannote annotation
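+ # the current chunk is closed and a new one is opened at every segmentation
+ # boundary (segval == 1) and at every gap between consecutive speech frame indices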
+ chunks = Annotation()
+ fbegin = None
+
+ lastframe = None
+ ichunk = 0
+ for segval, iframe in zip(seg, frameids):
+ if segval == 1:
+ if lastframe is not None:
+ chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, iframe-fbegin)] = str(ichunk)
+ ichunk += 1
+ fbegin = iframe
+ elif iframe - 1 != lastframe:
+ if lastframe is not None:
+ chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
+ fbegin = iframe
+ lastframe = iframe
+ if lastframe != fbegin:
+ chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
+
+
+ # performs BIC clustering
+ bicClustering = BICClustering(covariance_type='full', penalty_coef=self.bic_penalty_coeff)
+ hypothesis = bicClustering(chunks, feature=pyannotefeat)
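+ # hypothesis is a pyannote Annotation in which chunks merged into the same
+ # cluster share a label, used below as the speaker identifier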
+
+ # get diarisation results
+ tmplabel = [int(h[2]) for h in hypothesis.itertracks(True)]
+ tmptime = [h[0].start for h in hypothesis.itertracks()]
+ tmpduration = [h[0].duration for h in hypothesis.itertracks()]
+
+ # merge adjacent segments sharing the same label
+ label = []
+ time = []
+ duration = []
+ lastlabel = None
+ for l, t, d in zip(tmplabel, tmptime, tmpduration):
+ if l != lastlabel:
+ label.append(l)
+ duration.append(d)
+ time.append(t)
+ else:
+ duration[-1] = t + d - time[-1]
+ lastlabel = l
+
+
+ # store diarisation result
+ diar_res = self.new_result(data_mode='label', time_mode='segment')
+ diar_res.id_metadata.id += '.' + 'speakers'
+ diar_res.id_metadata.name += ' ' + 'speaker identifiers'
+ diar_res.data_object.label = label
+ diar_res.data_object.time = time
+ diar_res.data_object.duration = duration
+ diar_res.label_metadata.label = dict()
+ for lab in diar_res.data_object.label:
+ diar_res.label_metadata.label[lab] = str(lab)
+
+ self.process_pipe.results.add(diar_res)
"""
Parameters:
----------
- sad_model : string bellowing to 'etape' 'maya'
- alllows the selection of a SAD model:
- 'etape' is more suited to radionews material
- 'maya' is more suited to speech obtained in noisy environments
+
+ sad_model : string belonging to ['etape', 'maya']
+ Allows the selection of trained speech activity detection models.
+ * 'etape' models were trained on data distributed in the framework of the
+ ETAPE campaign (http://www.afcp-parole.org/etape.html).
+ These models are suited to radio news material (0.974 AUC on Etape data).
+ * 'maya' models were trained on data collected by EREA (Centre
+ Enseignement et Recherche en Ethnologie Amerindienne).
+ These models are suited to speech obtained in noisy environments
+ (0.915 AUC on Maya data).
+
+
+ dews: dilatation and erosion window size (seconds)
+ Size in seconds of the sliding window used to perform a dilatation
+ followed by an erosion procedure. These procedures output the max
+ (respectively the min) of the speech detection estimate over the window.
+ Their order is aimed at removing non-speech frames corresponding to
+ fricatives or short pauses. The window size also corresponds to the
+ minimal duration of the resulting speech/non-speech segments.
+
+ speech_threshold: threshold used for speech/non speech decision
+ based on the log likelihood difference
+
+ dllh_bounds: raw log likelihood difference estimates will be bounded
+ according to this (min_llh_difference, max_llh_difference) tuple.
+ Useful for plotting log likelihood differences.
+ If set to None, no bounding is performed.
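+
+ Example: a minimal usage sketch showing how to run LimsiSad in a TimeSide
+ pipe. The input file name and parameter values are illustrative only, and
+ the decoder import path may differ between TimeSide versions:
+
+ import timeside
+ from timeside.decoder import FileDecoder
+
+ decoder = FileDecoder('radio_show.wav')  # hypothetical input file
+ sad = LimsiSad(sad_model='etape', dews=0.2, speech_threshold=1., dllh_bounds=(-10., 10.))
+ pipe = (decoder | sad)
+ pipe.run()
+ # speech/non speech segments, retrieved using the analyzer id as prefix
+ segments = pipe.results.get_result_by_id(sad.id() + '.sad_segments')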
-
"""
super(LimsiSad, self).__init__()
# load gmm model
if sad_model not in ['etape', 'maya']:
- raise ValueError("argument sad_model %s not supported. Supported values are 'etape' or 'maya'" % sad_model)
- picfname = os.path.join(timeside.__path__[0], 'trained_models', 'limsi_sad_%s.pkl' % sad_model)
+ raise ValueError(
+ "argument sad_model %s not supported. Supported values are 'etape' or 'maya'" % sad_model)
+ picfname = os.path.join(
+ timeside.__path__[0], 'analyzer', 'trained_models', 'limsi_sad_%s.pkl' % sad_model)
self.gmms = pickle.load(open(picfname, 'rb'))
+ self.dews = dews
+ self.speech_threshold = speech_threshold
+ self.dllh_bounds = dllh_bounds
+
@staticmethod
@interfacedoc
def id():
return frames, eod
def post_process(self):
- mfcc = self.process_pipe.results['yaafe.mfcc']['data_object']['value']
- mfccd1 = self.process_pipe.results['yaafe.mfccd1']['data_object']['value']
- mfccd2 = self.process_pipe.results['yaafe.mfccd2']['data_object']['value']
- zcr = self.process_pipe.results['yaafe.zcr']['data_object']['value']
+ # extract signal features
-
- features = np.concatenate((mfcc, mfccd1, mfccd2, zcr), axis=1)
-
- res = 0.5 + 0.5 * \
- (self.gmms[0].llh(features) - self.gmms[1].llh(features))
-
+ yaafe_result = self.process_pipe.results[self.parents['yaafe'].uuid()]
+ mfcc = yaafe_result['yaafe.mfcc']['data_object']['value']
+ mfccd1 = yaafe_result['yaafe.mfccd1']['data_object']['value']
+ mfccd2 = yaafe_result['yaafe.mfccd2']['data_object']['value']
+ zcr = yaafe_result['yaafe.zcr']['data_object']['value']
+ features = N.concatenate((mfcc, mfccd1, mfccd2, zcr), axis=1)
+
+ # compute log likelihood difference
+ res = 0.5 + 0.5 * (self.gmms[0].llh(features) - self.gmms[1].llh(features))
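+ # gmms[0] and gmms[1] are assumed here to be the speech and non-speech models,
+ # so larger values of res indicate frames that are more speech-like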
+
+ # bounds log likelihood difference
+ if self.dllh_bounds is not None:
+ mindiff, maxdiff = self.dllh_bounds
+ res = N.minimum(N.maximum(res, mindiff), maxdiff)
+
+ # perform dilatation, erosion, erosion, dilatation
+ ws = int(self.dews * float(self.input_samplerate) / self.input_stepsize)
+ deed_llh = dilatation(erosion(erosion(dilatation(res, ws), ws), ws), ws)
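+ # dilatation and erosion act as max and min filters, so this sequence amounts
+ # to a morphological closing (filling short non-speech dips) followed by an
+ # opening (removing short speech bursts) of the log likelihood difference estimate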
+
+ # infer speech and non speech segments from dilated
+ # and eroded likelihood difference estimate
+ last = None
+ labels = []
+ times = []
+ durations = []
+ for i, val in enumerate([1 if e > self.speech_threshold else 0 for e in deed_llh]):
+ if val != last:
+ labels.append(val)
+ durations.append(1)
+ times.append(i)
+ else:
+ durations[-1] += 1
+ last = val
+ times = [(float(e) * self.input_stepsize) / self.input_samplerate for e in times]
+ durations = [(float(e) * self.input_stepsize) / self.input_samplerate for e in durations]
+
+
+ # outputs the raw frame level speech/non speech log likelihood difference
sad_result = self.new_result(data_mode='value', time_mode='framewise')
sad_result.id_metadata.id += '.' + 'sad_lhh_diff'
- sad_result.id_metadata.name += ' ' + \
- 'Speech Activity Detection Log Likelihood Difference'
+ sad_result.id_metadata.name += ' ' + 'Speech Activity Detection Log Likelihood Difference'
sad_result.data_object.value = res
- self.add_result(sad_result)
+ self.process_pipe.results.add(sad_result)
+
+ # outputs frame level speech/non speech log likelihood difference
+ # altered with erosion and dilatation procedures
+ sad_de_result = self.new_result(data_mode='value', time_mode='framewise')
+ sad_de_result.id_metadata.id += '.' + 'sad_de_lhh_diff'
+ sad_de_result.id_metadata.name += ' ' + 'Speech Activity Detection Log Likelihood Difference | dilat | erode'
+ sad_de_result.data_object.value = deed_llh
+ self.process_pipe.results.add(sad_de_result)
+
+ # outputs speech/non speech segments
+ sad_seg_result = self.new_result(data_mode='label', time_mode='segment')
+ sad_seg_result.id_metadata.id += '.' + 'sad_segments'
+ sad_seg_result.id_metadata.name += ' ' + 'Speech Activity Detection Segments'
+ sad_seg_result.data_object.label = labels
+ sad_seg_result.data_object.time = times
+ sad_seg_result.data_object.duration = durations
+ sad_seg_result.label_metadata.label = {0: 'Not Speech', 1: 'Speech'}
+
+ self.process_pipe.results.add(sad_seg_result)