From: David Doukhan
Date: Fri, 26 Sep 2014 14:03:05 +0000 (+0200)
Subject: Merge remote-tracking branch 'upstream/diadems' into diadems
X-Git-Url: https://git.parisson.com/?a=commitdiff_plain;h=34c75d69383596be8843cc5c351102ce2c3780c6;p=timeside.git

Merge remote-tracking branch 'upstream/diadems' into diadems

Conflicts:
    timeside/analyzer/limsi_diarization.py
    timeside/analyzer/limsi_sad.py
---

34c75d69383596be8843cc5c351102ce2c3780c6
diff --cc timeside/analyzer/limsi_diarization.py
index fd2cb53,613dbf8..623e3a2
--- a/timeside/analyzer/limsi_diarization.py
+++ b/timeside/analyzer/limsi_diarization.py
@@@ -23,10 -23,10 +23,9 @@@
 from timeside.core import implements, interfacedoc
 from timeside.analyzer.core import Analyzer
 from timeside.api import IAnalyzer
--# from timeside.analyzer import IRITSpeech4Hz
 from yaafe import Yaafe
 import yaafelib
- from timeside.analyzer import LimsiSad
+ from timeside.analyzer.limsi_sad import LimsiSad
 import numpy as N
 import sys
@@@ -116,85 -117,119 +115,83 @@@ class LimsiDiarization(Analyzer)
        return frames, eod

    def post_process(self):
-
+        # extract mfcc with yaafe and store them to be used with pyannote
-        mfcc = self.process_pipe.results['yaafe.mfccchop']['data_object']['value']
-        sw = YaafeFrame(self.input_blocksize, self.input_stepsize, self.input_samplerate)
-        pyannotefeat = SlidingWindowFeature(mfcc, sw)
+        mfcc = self.process_pipe.results.get_result_by_id('yaafe.mfccchop')['data_object']['value']
+        sw = YaafeFrame(self.input_blocksize, self.input_stepsize, self.input_samplerate)
+        pyannotefeat = SlidingWindowFeature(mfcc, sw)
-        # speech activity detection: usefull for debugging purpose only
-        # print 'adding sad res to result'
-        # sadres = self.new_result(data_mode='value', time_mode='framewise')
-        # sadres.id_metadata.id += '.' + 'sadres'
-        # sadres.id_metadata.name += ' ' + 'SAD RESULT'
-        sadval = self.process_pipe.results.get_result_by_id(self.sad_analyzer.id() + '.sad_lhh_diff').data_object.value[:]
-        # sadres.data_object.value = sadval
-        # self.process_pipe.results.add(sadres)
-        # gaussian divergence window size
-        #winframesize = int((self.input_samplerate / self.input_stepsize) * self.chop_window_size)
        timestepsize = self.input_stepsize / float(self.input_samplerate)
        gdiff_win_size_frame = int(self.gdiff_win_size_sec / timestepsize)
        min_seg_size_frame = int(self.min_seg_size_sec / timestepsize)
-        # print 'timestepsize %d, gdiffwinsize (sec: %f, frame: %d) , minsegsize (sec: %f, frame: %d)' % (timestepsize, self.gdiff_win_size_sec, gdiff_win_size_frame, self.min_seg_size_sec, min_seg_size_frame)
-
-
-        # basic gauss div
-        #bgd = (range(0, len(mfcc)), 'basicgdiv')
-        # speech gauss div
-        sgd = ([i for i, val in enumerate(sadval) if val > 0], 'speechgdiv')
-        # speech above threshold
-        #thres = N.percentile([val for val in sadval if val > 0], 25)
-        #sat = ([i for i, val in enumerate(sadval) if val > thres], 'speechthreshold25p')
-
-# for frameids, name in [bgd, sgd, sat]:
-        for frameids, name in [sgd]:
-
-            gdiff = gauss_div(mfcc[frameids,:], gdiff_win_size_frame)
-
-            # debug purpose only
-            # res = self.new_result(data_mode='value', time_mode='event')
-            # res.id_metadata.id += ('.%s' % name)
-            # res.id_metadata.name += (' %s' % name)
-            # res.data_object.value = N.array(gdiff)
-            # res.data_object.time = N.array(frameids[gdiff_win_size_frame:(1-gdiff_win_size_frame)]) * timestepsize
-            # self.process_pipe.results.add(res)
-
-            seg = segment(gdiff, min_seg_size_frame)
-            # seg_result = self.new_result(data_mode='value', time_mode='event')
-            # seg_result.id_metadata.id += '.' + name + 'segchop'
-            # seg_result.id_metadata.name += ' ' + name + 'seg chop'
-            # seg_result.data_object.value = N.array(seg)
-            # seg_result.data_object.time = N.array(frameids[gdiff_win_size_frame:(1-gdiff_win_size_frame)]) * timestepsize
-            # self.process_pipe.results.add(seg_result)
-
-            # build pyannote annotation
-            # print 'building annotation'
-            #b = time.time()
-            chunks = Annotation()
-            fbegin = None
-            #fend = None
-            lastframe = None
-            ichunk = 0
-            for segval, iframe in zip(seg, frameids):
-                if segval == 1:
-                    if lastframe is not None:
-                        chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, iframe-fbegin)] = str(ichunk)
-                        ichunk += 1
-                    fbegin= iframe
-                elif iframe -1 != lastframe:
-                    if lastframe is not None:
-                        chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
-                    fbegin= iframe
-                lastframe = iframe
-            if lastframe != fbegin:
-                chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
-            # print 'chunks', chunks
-            #print 'anotation build in', (time.time() - b)
-
-            bicClustering = BICClustering(covariance_type='full', penalty_coef=self.bic_penalty_coeff)
-            hypothesis = bicClustering(chunks, feature=pyannotefeat)
-
-            # gen result interval
-            #print 'gen result interval'
-            diar_res = self.new_result(data_mode='label', time_mode='segment')
-            diar_res.id_metadata.id += '.' + 'speakers' # + name + 'diarisation'
-            diar_res.id_metadata.name += ' ' + 'speaker identifiers' # name + 'diarisation'
-
-            tmplabel = [int(h[2]) for h in hypothesis.itertracks(True)]
-            tmptime = [h[0].start for h in hypothesis.itertracks()]
-            tmpduration = [h[0].duration for h in hypothesis.itertracks()]
-
-            label = []
-            time = []
-            duration = []
-            lastlabel = None
-
-            for l, t, d in zip(tmplabel, tmptime, tmpduration):
-                if l != lastlabel:
-                    label.append(l)
-                    duration.append(d)
-                    time.append(t)
-                else:
-                    duration[-1] = t + d - time[-1]
-                lastlabel = l
-
-
-            diar_res.data_object.label = label
-            diar_res.data_object.time = time
-            diar_res.data_object.duration = duration
-            diar_res.data_object.label_metadata.label = dict()
-            for lab in diar_res.data_object.label:
-                diar_res.data_object.label_metadata.label[lab] = str(lab)
-            # TODO FIXME
-            # for h in hypothesis.itertracks(label=True):
-            # diar_res.data_object.label.append(h[2])
-            # diar_res.data_object.time.append(h[0].start)
-            # diar_res.data_object.duration.apeend(h[0].duration)
-            #sadres.data_object.value = sadval
-            self.process_pipe.results.add(diar_res)
+
-
+        # speech activity detection
-        sadval = self.process_pipe.results[self.sad_analyzer.id() + '.sad_lhh_diff'].data_object.value[:]
++        sadval = self.process_pipe.results.get_result_by_id(self.sad_analyzer.id() + '.sad_lhh_diff').data_object.value[:]
+        # indices of frames detected as speech
+        speech_threshold = 0.
+        frameids = [i for i, val in enumerate(sadval) if val > speech_threshold]
+
+        # compute gaussian divergence of speech frames only
+        gdiff = gauss_div(mfcc[frameids,:], gdiff_win_size_frame)
+
+        # initial segmentation based on gaussian divergence criterion
+        seg = segment(gdiff, min_seg_size_frame)
+
+        # Convert initial segmentation to pyannote annotation
+        chunks = Annotation()
+        fbegin = None
+
+        lastframe = None
+        ichunk = 0
+        for segval, iframe in zip(seg, frameids):
+            if segval == 1:
+                if lastframe is not None:
+                    chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, iframe-fbegin)] = str(ichunk)
+                    ichunk += 1
+                fbegin= iframe
+            elif iframe -1 != lastframe:
+                if lastframe is not None:
+                    chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
+                fbegin= iframe
+            lastframe = iframe
+        if lastframe != fbegin:
+            chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
+
+
+        # performs BIC clustering
+        bicClustering = BICClustering(covariance_type='full', penalty_coef=self.bic_penalty_coeff)
+        hypothesis = bicClustering(chunks, feature=pyannotefeat)
+
+        # get diarisation results
+        tmplabel = [int(h[2]) for h in hypothesis.itertracks(True)]
+        tmptime = [h[0].start for h in hypothesis.itertracks()]
+        tmpduration = [h[0].duration for h in hypothesis.itertracks()]
+
+        # merge adjacent clusters having same labels
+        label = []
+        time = []
+        duration = []
+        lastlabel = None
+        for l, t, d in zip(tmplabel, tmptime, tmpduration):
+            if l != lastlabel:
+                label.append(l)
+                duration.append(d)
+                time.append(t)
+            else:
+                duration[-1] = t + d - time[-1]
+            lastlabel = l
+
+
+        # store diarisation result
+        diar_res = self.new_result(data_mode='label', time_mode='segment')
+        diar_res.id_metadata.id += '.' + 'speakers' # + name + 'diarisation'
+        diar_res.id_metadata.name += ' ' + 'speaker identifiers' # name + 'diarisation'
+        diar_res.data_object.label = label
+        diar_res.data_object.time = time
+        diar_res.data_object.duration = duration
+        diar_res.label_metadata.label = dict()
+        for lab in diar_res.data_object.label:
+            diar_res.label_metadata.label[lab] = str(lab)
+
+        self.process_pipe.results.add(diar_res)
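The merge step in the new post_process above collapses consecutive BIC clusters that received the same speaker label into a single segment, extending the previous segment instead of emitting a new one. A minimal standalone sketch of that step with made-up segment values (the helper name merge_adjacent_segments is illustrative, not part of TimeSide):

    def merge_adjacent_segments(labels, starts, durations):
        """Collapse consecutive segments sharing the same label into one segment."""
        out_label, out_start, out_duration = [], [], []
        last = None
        for l, t, d in zip(labels, starts, durations):
            if l != last:
                out_label.append(l)
                out_start.append(t)
                out_duration.append(d)
            else:
                # stretch the previous segment up to the end of the current one
                out_duration[-1] = t + d - out_start[-1]
            last = l
        return out_label, out_start, out_duration

    # toy values: the first two segments belong to the same speaker
    print(merge_adjacent_segments([0, 0, 1], [0.0, 1.5, 3.0], [1.5, 1.5, 2.0]))
    # -> ([0, 1], [0.0, 3.0], [3.0, 2.0])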
diff --cc timeside/analyzer/limsi_sad.py
index 4d58a84,ba18b83..249c785
--- a/timeside/analyzer/limsi_sad.py
+++ b/timeside/analyzer/limsi_sad.py
@@@ -105,35 -66,10 +105,34 @@@ class LimsiSad(Analyzer)
        """
        Parameters:
        ----------
-        sad_model : string bellowing to 'etape' 'maya'
-        alllows the selection of a SAD model:
-        'etape' is more suited to radionews material
-        'maya' is more suited to speech obtained in noisy environments
+
+        sad_model : string belonging to ['etape', 'maya']
+            Allows the selection of trained speech activity detection models.
+            * 'etape' models were trained on data distributed in the framework of the
+              ETAPE campaign (http://www.afcp-parole.org/etape.html).
+              These models are suited to radionews material (0.974 AUC on Etape data).
+            * 'maya' models were obtained on data collected by EREA – Centre
+              Enseignement et Recherche en Ethnologie Amerindienne.
+              These models are suited to speech obtained in noisy environments
+              (0.915 AUC on Maya data).
+
+
+        dews: dilatation and erosion window size (seconds)
+            This value corresponds to the size, in seconds, of the sliding window
+            used to perform a dilatation followed by an erosion procedure.
+            These procedures output the max (respectively the min) of the
+            speech detection estimate over the window. The order of these procedures
+            is aimed at removing non-speech frames corresponding to fricatives or
+            short pauses. The size of the window corresponds to the minimal size of
+            the resulting speech/non-speech segments.
+
+        speech_threshold: threshold used for the speech/non-speech decision,
+            based on the log likelihood difference
+
+        dllh_bounds: raw log likelihood difference estimates will be bounded
+            according to this (min_llh_difference, max_llh_difference) tuple.
+            Useful for plotting log likelihood differences.
+            If set to None, no bounding will be done.
-
        """
        super(LimsiSad, self).__init__()
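The parameters documented above are passed to the LimsiSad constructor. A minimal usage sketch follows; the numeric values are illustrative only and are not the analyzer's defaults (the actual signature lies outside this hunk):

    from timeside.analyzer.limsi_sad import LimsiSad

    # illustrative values, not the defaults defined in the constructor
    sad = LimsiSad(sad_model='etape',         # or 'maya' for noisy field recordings
                   dews=0.2,                  # dilatation/erosion window, in seconds
                   speech_threshold=1.0,      # decision threshold on the llh difference
                   dllh_bounds=(-10., 10.))   # clip the raw llh difference, None to disable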
@@@ -154,14 -93,12 +156,16 @@@
        # load gmm model
        if sad_model not in ['etape', 'maya']:
-            raise ValueError("argument sad_model %s not supported. Supported values are 'etape' or 'maya'" % sad_model)
-        picfname = os.path.join(timeside.__path__[0], 'trained_models', 'limsi_sad_%s.pkl' % sad_model)
+            raise ValueError(
+                "argument sad_model %s not supported. Supported values are 'etape' or 'maya'" % sad_model)
+        picfname = os.path.join(
+            timeside.__path__[0], 'analyzer', 'trained_models', 'limsi_sad_%s.pkl' % sad_model)
        self.gmms = pickle.load(open(picfname, 'rb'))
+        self.dews = dews
+        self.speech_threshold = speech_threshold
+        self.dllh_bounds = dllh_bounds
+
    @staticmethod
    @interfacedoc
    def id():
@@@ -182,67 -123,20 +190,66 @@@
        return frames, eod

    def post_process(self):
+        # extract signal features
-        mfcc = self.process_pipe.results['yaafe.mfcc']['data_object']['value']
-        mfccd1 = self.process_pipe.results['yaafe.mfccd1']['data_object']['value']
-        mfccd2 = self.process_pipe.results['yaafe.mfccd2']['data_object']['value']
-        zcr = self.process_pipe.results['yaafe.zcr']['data_object']['value']
+        yaafe_result = self.process_pipe.results[self.parents['yaafe'].uuid()]
+        mfcc = yaafe_result['yaafe.mfcc']['data_object']['value']
+        mfccd1 = yaafe_result['yaafe.mfccd1']['data_object']['value']
+        mfccd2 = yaafe_result['yaafe.mfccd2']['data_object']['value']
+        zcr = yaafe_result['yaafe.zcr']['data_object']['value']
-
-        features = np.concatenate((mfcc, mfccd1, mfccd2, zcr), axis=1)
-
-        res = 0.5 + 0.5 * \
-            (self.gmms[0].llh(features) - self.gmms[1].llh(features))
-
+        features = N.concatenate((mfcc, mfccd1, mfccd2, zcr), axis=1)
+
+        # compute log likelihood difference
+        res = 0.5 + 0.5 * (self.gmms[0].llh(features) - self.gmms[1].llh(features))
+
+        # bounds log likelihood difference
+        if self.dllh_bounds is not None:
+            mindiff, maxdiff = self.dllh_bounds
+            res = N.minimum(N.maximum(res, mindiff), maxdiff)
+
+        # performs dilation, erosion, erosion, dilatation
+        ws = int(self.dews * float(self.input_samplerate ) / self.input_stepsize)
+        deed_llh = dilatation(erosion(erosion(dilatation(res, ws), ws), ws), ws)
+
+        # infer speech and non speech segments from dilated
+        # and eroded likelihood difference estimate
+        last = None
+        labels = []
+        times = []
+        durations = []
+        for i, val in enumerate([1 if e > self.speech_threshold else 0 for e in deed_llh]):
+            if val != last:
+                labels.append(val)
+                durations.append(1)
+                times.append(i)
+            else:
+                durations[-1] += 1
+            last = val
+        times = [(float(e) * self.input_stepsize) / self.input_samplerate for e in times]
+        durations = [(float(e) * self.input_stepsize) / self.input_samplerate for e in durations]
+
+
+        # outputs the raw frame level speech/non speech log likelihood difference
        sad_result = self.new_result(data_mode='value', time_mode='framewise')
        sad_result.id_metadata.id += '.' + 'sad_lhh_diff'
-        sad_result.id_metadata.name += ' ' + \
-            'Speech Activity Detection Log Likelihood Difference'
+        sad_result.id_metadata.name += ' ' + 'Speech Activity Detection Log Likelihood Difference'
        sad_result.data_object.value = res
-        self.add_result(sad_result)
+        self.process_pipe.results.add(sad_result)
+
+        # outputs frame level speech/non speech log likelihood difference
+        # altered with erosion and dilatation procedures
+        sad_de_result = self.new_result(data_mode='value', time_mode='framewise')
+        sad_de_result.id_metadata.id += '.' + 'sad_de_lhh_diff'
+        sad_de_result.id_metadata.name += ' ' + 'Speech Activity Detection Log Likelihood Difference | dilat | erode'
+        sad_de_result.data_object.value = deed_llh
+        self.process_pipe.results.add(sad_de_result)
+
+        # outputs speech/non speech segments
+        sad_seg_result = self.new_result(data_mode='label', time_mode='segment')
+        sad_seg_result.id_metadata.id += '.' + 'sad_segments'
+        sad_seg_result.id_metadata.name += ' ' + 'Speech Activity Detection Segments'
+        sad_seg_result.data_object.label = labels
+        sad_seg_result.data_object.time = times
+        sad_seg_result.data_object.duration = durations
+        sad_seg_result.label_metadata.label = {0: 'Not Speech', 1: 'Speech'}
+
+        self.process_pipe.results.add(sad_seg_result)
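As a rough illustration of the chain implemented above (optional bounding of the log likelihood difference, dilatation/erosion smoothing over a dews-sized window, thresholding, then run-length encoding into labelled segments), here is a self-contained sketch. TimeSide's own dilatation and erosion helpers are not reproduced; they are approximated below as sliding-window max and min, as described in the constructor docstring, and all values are toy data:

    import numpy as N

    def dilatation(x, ws):
        # sliding-window maximum (approximation of the dilatation step)
        return N.array([x[max(0, i - ws):i + ws + 1].max() for i in range(len(x))])

    def erosion(x, ws):
        # sliding-window minimum (approximation of the erosion step)
        return N.array([x[max(0, i - ws):i + ws + 1].min() for i in range(len(x))])

    # toy frame-level log likelihood differences (one value per analysis frame)
    res = N.array([0.2, 1.5, 1.8, -0.3, 1.6, 1.7, 1.9, 0.1, 0.0, -0.2])

    # 1. bounding, as done when dllh_bounds is not None
    mindiff, maxdiff = -10., 10.
    res = N.minimum(N.maximum(res, mindiff), maxdiff)

    # 2. dilatation / erosion / erosion / dilatation with a window of ws frames
    ws = 1
    deed_llh = dilatation(erosion(erosion(dilatation(res, ws), ws), ws), ws)

    # 3. threshold and run-length encode into (label, start frame, frame count)
    speech_threshold = 0.5
    labels, times, durations, last = [], [], [], None
    for i, val in enumerate([1 if e > speech_threshold else 0 for e in deed_llh]):
        if val != last:
            labels.append(val)
            times.append(i)
            durations.append(1)
        else:
            durations[-1] += 1
        last = val

    # frame indices convert to seconds with: frame_index * input_stepsize / float(input_samplerate)
    print(labels, times, durations)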