from timeside.core import implements, interfacedoc
from timeside.analyzer.core import Analyzer
from timeside.api import IAnalyzer
- # from timeside.analyzer import IRITSpeech4Hz
from yaafe import Yaafe
import yaafelib
- from timeside.analyzer import LimsiSad
+ from timeside.analyzer.limsi_sad import LimsiSad
import numpy as N
import sys
return frames, eod
def post_process(self):
-
+ # extract mfcc with yaafe and store them to be used with pyannote
- mfcc = self.process_pipe.results['yaafe.mfccchop']['data_object']['value']
- sw = YaafeFrame(self.input_blocksize, self.input_stepsize, self.input_samplerate)
- pyannotefeat = SlidingWindowFeature(mfcc, sw)
+ mfcc = self.process_pipe.results.get_result_by_id('yaafe.mfccchop')['data_object']['value']
+ sw = YaafeFrame(self.input_blocksize, self.input_stepsize, self.input_samplerate)
+ pyannotefeat = SlidingWindowFeature(mfcc, sw)
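+ # SlidingWindowFeature pairs the MFCC matrix with its frame timing (YaafeFrame),
+ # so frame indices can later be mapped back to time segments via
+ # pyannotefeat.sliding_window.rangeToSegment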
- # speech activity detection: usefull for debugging purpose only
- # print 'adding sad res to result'
- # sadres = self.new_result(data_mode='value', time_mode='framewise')
- # sadres.id_metadata.id += '.' + 'sadres'
- # sadres.id_metadata.name += ' ' + 'SAD RESULT'
- sadval = self.process_pipe.results.get_result_by_id(self.sad_analyzer.id() + '.sad_lhh_diff').data_object.value[:]
- # sadres.data_object.value = sadval
- # self.process_pipe.results.add(sadres)
-
# gaussian divergence window size
- #winframesize = int((self.input_samplerate / self.input_stepsize) * self.chop_window_size)
timestepsize = self.input_stepsize / float(self.input_samplerate)
gdiff_win_size_frame = int(self.gdiff_win_size_sec / timestepsize)
min_seg_size_frame = int(self.min_seg_size_sec / timestepsize)
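+ # e.g. a 1024-sample step at 32 kHz gives timestepsize = 0.032 s, so a 5 s
+ # divergence window corresponds to int(5 / 0.032) = 156 frames (illustrative values)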
- # print 'timestepsize %d, gdiffwinsize (sec: %f, frame: %d) , minsegsize (sec: %f, frame: %d)' % (timestepsize, self.gdiff_win_size_sec, gdiff_win_size_frame, self.min_seg_size_sec, min_seg_size_frame)
-
-
- # basic gauss div
- #bgd = (range(0, len(mfcc)), 'basicgdiv')
- # speech gauss div
- sgd = ([i for i, val in enumerate(sadval) if val > 0], 'speechgdiv')
- # speech above threshold
- #thres = N.percentile([val for val in sadval if val > 0], 25)
- #sat = ([i for i, val in enumerate(sadval) if val > thres], 'speechthreshold25p')
-
-# for frameids, name in [bgd, sgd, sat]:
- for frameids, name in [sgd]:
-
- gdiff = gauss_div(mfcc[frameids,:], gdiff_win_size_frame)
-
- # debug purpose only
- # res = self.new_result(data_mode='value', time_mode='event')
- # res.id_metadata.id += ('.%s' % name)
- # res.id_metadata.name += (' %s' % name)
- # res.data_object.value = N.array(gdiff)
- # res.data_object.time = N.array(frameids[gdiff_win_size_frame:(1-gdiff_win_size_frame)]) * timestepsize
- # self.process_pipe.results.add(res)
-
- seg = segment(gdiff, min_seg_size_frame)
- # seg_result = self.new_result(data_mode='value', time_mode='event')
- # seg_result.id_metadata.id += '.' + name + 'segchop'
- # seg_result.id_metadata.name += ' ' + name + 'seg chop'
- # seg_result.data_object.value = N.array(seg)
- # seg_result.data_object.time = N.array(frameids[gdiff_win_size_frame:(1-gdiff_win_size_frame)]) * timestepsize
- # self.process_pipe.results.add(seg_result)
-
- # build pyannote annotation
- # print 'building annotation'
- #b = time.time()
- chunks = Annotation()
- fbegin = None
- #fend = None
- lastframe = None
- ichunk = 0
- for segval, iframe in zip(seg, frameids):
- if segval == 1:
- if lastframe is not None:
- chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, iframe-fbegin)] = str(ichunk)
- ichunk += 1
- fbegin= iframe
- elif iframe -1 != lastframe:
- if lastframe is not None:
- chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
- fbegin= iframe
- lastframe = iframe
- if lastframe != fbegin:
- chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
- # print 'chunks', chunks
- #print 'anotation build in', (time.time() - b)
-
- bicClustering = BICClustering(covariance_type='full', penalty_coef=self.bic_penalty_coeff)
- hypothesis = bicClustering(chunks, feature=pyannotefeat)
-
- # gen result interval
- #print 'gen result interval'
- diar_res = self.new_result(data_mode='label', time_mode='segment')
- diar_res.id_metadata.id += '.' + 'speakers' # + name + 'diarisation'
- diar_res.id_metadata.name += ' ' + 'speaker identifiers' # name + 'diarisation'
-
- tmplabel = [int(h[2]) for h in hypothesis.itertracks(True)]
- tmptime = [h[0].start for h in hypothesis.itertracks()]
- tmpduration = [h[0].duration for h in hypothesis.itertracks()]
-
- label = []
- time = []
- duration = []
- lastlabel = None
-
- for l, t, d in zip(tmplabel, tmptime, tmpduration):
- if l != lastlabel:
- label.append(l)
- duration.append(d)
- time.append(t)
- else:
- duration[-1] = t + d - time[-1]
- lastlabel = l
-
-
- diar_res.data_object.label = label
- diar_res.data_object.time = time
- diar_res.data_object.duration = duration
- diar_res.data_object.label_metadata.label = dict()
- for lab in diar_res.data_object.label:
- diar_res.data_object.label_metadata.label[lab] = str(lab)
- # TODO FIXME
- # for h in hypothesis.itertracks(label=True):
- # diar_res.data_object.label.append(h[2])
- # diar_res.data_object.time.append(h[0].start)
- # diar_res.data_object.duration.apeend(h[0].duration)
- #sadres.data_object.value = sadval
- self.process_pipe.results.add(diar_res)
+
-
+ # speech activity detection
- sadval = self.process_pipe.results[self.sad_analyzer.id() + '.sad_lhh_diff'].data_object.value[:]
+ sadval = self.process_pipe.results.get_result_by_id(self.sad_analyzer.id() + '.sad_lhh_diff').data_object.value[:]
+ # indices of frames detected as speech
+ speech_threshold = 0.
+ frameids = [i for i, val in enumerate(sadval) if val > speech_threshold]
+
+ # compute gaussian divergence of speech frames only
+ gdiff = gauss_div(mfcc[frameids,:], gdiff_win_size_frame)
+
+ # initial segmentation based on gaussian divergence criterion
+ seg = segment(gdiff, min_seg_size_frame)
+
+ # Convert initial segmentation to pyannote annotation
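+ # the current chunk is closed and a new one is opened at every segmentation
+ # boundary (segval == 1) and at every gap between consecutive speech frame indices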
+ chunks = Annotation()
+ fbegin = None
+
+ lastframe = None
+ ichunk = 0
+ for segval, iframe in zip(seg, frameids):
+ if segval == 1:
+ if lastframe is not None:
+ chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, iframe-fbegin)] = str(ichunk)
+ ichunk += 1
+ fbegin = iframe
+ elif iframe - 1 != lastframe:
+ if lastframe is not None:
+ chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
+ fbegin = iframe
+ lastframe = iframe
+ if lastframe != fbegin:
+ chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
+
+
+ # performs BIC clustering
+ bicClustering = BICClustering(covariance_type='full', penalty_coef=self.bic_penalty_coeff)
+ hypothesis = bicClustering(chunks, feature=pyannotefeat)
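+ # hypothesis is a pyannote Annotation in which chunks merged into the same
+ # cluster share a label, used below as the speaker identifier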
+
+ # get diarisation results
+ tmplabel = [int(h[2]) for h in hypothesis.itertracks(True)]
+ tmptime = [h[0].start for h in hypothesis.itertracks()]
+ tmpduration = [h[0].duration for h in hypothesis.itertracks()]
+
+ # merge adjacent segments sharing the same label
+ label = []
+ time = []
+ duration = []
+ lastlabel = None
+ for l, t, d in zip(tmplabel, tmptime, tmpduration):
+ if l != lastlabel:
+ label.append(l)
+ duration.append(d)
+ time.append(t)
+ else:
+ duration[-1] = t + d - time[-1]
+ lastlabel = l
+
+
+ # store diarisation result
+ diar_res = self.new_result(data_mode='label', time_mode='segment')
+ diar_res.id_metadata.id += '.' + 'speakers'
+ diar_res.id_metadata.name += ' ' + 'speaker identifiers'
+ diar_res.data_object.label = label
+ diar_res.data_object.time = time
+ diar_res.data_object.duration = duration
+ diar_res.label_metadata.label = dict()
+ for lab in diar_res.data_object.label:
+ diar_res.label_metadata.label[lab] = str(lab)
+
+ self.process_pipe.results.add(diar_res)
"""
Parameters:
----------
- sad_model : string bellowing to 'etape' 'maya'
- alllows the selection of a SAD model:
- 'etape' is more suited to radionews material
- 'maya' is more suited to speech obtained in noisy environments
+
+ sad_model : string belonging to ['etape', 'maya']
+ Allows the selection of trained speech activity detection models.
+ * 'etape' models were trained on data distributed in the framework of the
+ ETAPE campaign (http://www.afcp-parole.org/etape.html).
+ These models are suited to radio news material (0.974 AUC on Etape data).
+ * 'maya' models were trained on data collected by EREA (Centre
+ Enseignement et Recherche en Ethnologie Amerindienne).
+ These models are suited to speech obtained in noisy environments
+ (0.915 AUC on Maya data).
+
+
+ dews: dilatation and erosion window size (seconds)
+ Size in seconds of the sliding window used to perform a dilatation
+ followed by an erosion procedure. These procedures output the max
+ (respectively the min) of the speech detection estimate over the window.
+ Their order is aimed at removing non-speech frames corresponding to
+ fricatives or short pauses. The window size also corresponds to the
+ minimal duration of the resulting speech/non-speech segments.
+
+ speech_threshold: threshold used for speech/non speech decision
+ based on the log likelihood difference
+
+ dllh_bounds: raw log likelihood difference estimates will be bounded
+ according to this (min_llh_difference, max_llh_difference) tuple.
+ Useful for plotting log likelihood differences.
+ If set to None, no bounding is performed.
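+
+ Example: a minimal usage sketch showing how to run LimsiSad in a TimeSide
+ pipe. The input file name and parameter values are illustrative only, and
+ the decoder import path may differ between TimeSide versions:
+
+ import timeside
+ from timeside.decoder import FileDecoder
+
+ decoder = FileDecoder('radio_show.wav')  # hypothetical input file
+ sad = LimsiSad(sad_model='etape', dews=0.2, speech_threshold=1., dllh_bounds=(-10., 10.))
+ pipe = (decoder | sad)
+ pipe.run()
+ # speech/non speech segments, retrieved using the analyzer id as prefix
+ segments = pipe.results.get_result_by_id(sad.id() + '.sad_segments')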
-
"""
super(LimsiSad, self).__init__()
# load gmm model
if sad_model not in ['etape', 'maya']:
- raise ValueError("argument sad_model %s not supported. Supported values are 'etape' or 'maya'" % sad_model)
- picfname = os.path.join(timeside.__path__[0], 'trained_models', 'limsi_sad_%s.pkl' % sad_model)
+ raise ValueError(
+ "argument sad_model %s not supported. Supported values are 'etape' or 'maya'" % sad_model)
+ picfname = os.path.join(
+ timeside.__path__[0], 'analyzer', 'trained_models', 'limsi_sad_%s.pkl' % sad_model)
self.gmms = pickle.load(open(picfname, 'rb'))
+ self.dews = dews
+ self.speech_threshold = speech_threshold
+ self.dllh_bounds = dllh_bounds
+
@staticmethod
@interfacedoc
def id():
return frames, eod
def post_process(self):
- mfcc = self.process_pipe.results['yaafe.mfcc']['data_object']['value']
- mfccd1 = self.process_pipe.results['yaafe.mfccd1']['data_object']['value']
- mfccd2 = self.process_pipe.results['yaafe.mfccd2']['data_object']['value']
- zcr = self.process_pipe.results['yaafe.zcr']['data_object']['value']
+ # extract signal features
-
- features = np.concatenate((mfcc, mfccd1, mfccd2, zcr), axis=1)
-
- res = 0.5 + 0.5 * \
- (self.gmms[0].llh(features) - self.gmms[1].llh(features))
-
+ yaafe_result = self.process_pipe.results[self.parents['yaafe'].uuid()]
+ mfcc = yaafe_result['yaafe.mfcc']['data_object']['value']
+ mfccd1 = yaafe_result['yaafe.mfccd1']['data_object']['value']
+ mfccd2 = yaafe_result['yaafe.mfccd2']['data_object']['value']
+ zcr = yaafe_result['yaafe.zcr']['data_object']['value']
+ features = N.concatenate((mfcc, mfccd1, mfccd2, zcr), axis=1)
+
+ # compute log likelihood difference
+ res = 0.5 + 0.5 * (self.gmms[0].llh(features) - self.gmms[1].llh(features))
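+ # gmms[0] and gmms[1] are assumed here to be the speech and non-speech models,
+ # so larger values of res indicate frames that are more speech-like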
+
+ # bounds log likelihood difference
+ if self.dllh_bounds is not None:
+ mindiff, maxdiff = self.dllh_bounds
+ res = N.minimum(N.maximum(res, mindiff), maxdiff)
+
+ # perform dilatation, erosion, erosion, dilatation
+ ws = int(self.dews * float(self.input_samplerate) / self.input_stepsize)
+ deed_llh = dilatation(erosion(erosion(dilatation(res, ws), ws), ws), ws)
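+ # dilatation and erosion act as max and min filters, so this sequence amounts
+ # to a morphological closing (filling short non-speech dips) followed by an
+ # opening (removing short speech bursts) of the log likelihood difference estimate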
+
+ # infer speech and non speech segments from dilated
+ # and eroded likelihood difference estimate
+ last = None
+ labels = []
+ times = []
+ durations = []
+ for i, val in enumerate([1 if e > self.speech_threshold else 0 for e in deed_llh]):
+ if val != last:
+ labels.append(val)
+ durations.append(1)
+ times.append(i)
+ else:
+ durations[-1] += 1
+ last = val
+ times = [(float(e) * self.input_stepsize) / self.input_samplerate for e in times]
+ durations = [(float(e) * self.input_stepsize) / self.input_samplerate for e in durations]
+
+
+ # outputs the raw frame level speech/non speech log likelihood difference
sad_result = self.new_result(data_mode='value', time_mode='framewise')
sad_result.id_metadata.id += '.' + 'sad_lhh_diff'
- sad_result.id_metadata.name += ' ' + \
- 'Speech Activity Detection Log Likelihood Difference'
+ sad_result.id_metadata.name += ' ' + 'Speech Activity Detection Log Likelihood Difference'
sad_result.data_object.value = res
- self.add_result(sad_result)
+ self.process_pipe.results.add(sad_result)
+
+ # outputs frame level speech/non speech log likelihood difference
+ # altered with erosion and dilatation procedures
+ sad_de_result = self.new_result(data_mode='value', time_mode='framewise')
+ sad_de_result.id_metadata.id += '.' + 'sad_de_lhh_diff'
+ sad_de_result.id_metadata.name += ' ' + 'Speech Activity Detection Log Likelihood Difference | dilat | erode'
+ sad_de_result.data_object.value = deed_llh
+ self.process_pipe.results.add(sad_de_result)
+
+ # outputs speech/non speech segments
+ sad_seg_result = self.new_result(data_mode='label', time_mode='segment')
+ sad_seg_result.id_metadata.id += '.' + 'sad_segments'
+ sad_seg_result.id_metadata.name += ' ' + 'Speech Activity Detection Segments'
+ sad_seg_result.data_object.label = labels
+ sad_seg_result.data_object.time = times
+ sad_seg_result.data_object.duration = durations
+ sad_seg_result.label_metadata.label = {0: 'Not Speech', 1: 'Speech'}
+
+ self.process_pipe.results.add(sad_seg_result)