git.parisson.com Git - timeside.git/commitdiff
Merge remote-tracking branch 'upstream/diadems' into diadems
author    David Doukhan <david.doukhan@gmail.com>
          Fri, 26 Sep 2014 14:03:05 +0000 (16:03 +0200)
committer David Doukhan <david.doukhan@gmail.com>
          Fri, 26 Sep 2014 14:03:05 +0000 (16:03 +0200)
Conflicts:
timeside/analyzer/limsi_diarization.py
timeside/analyzer/limsi_sad.py

timeside/analyzer/core.py
timeside/analyzer/limsi_diarization.py
timeside/analyzer/limsi_sad.py

Simple merge
diff --cc timeside/analyzer/limsi_diarization.py
index fd2cb53b371d28a3b9b2b175f4ce5abdab942260,613dbf8bc00370c93a24cf7dd0a392e66041e9b2..623e3a2b87498e908f120ce29014fd19e8150fd8
  from timeside.core import implements, interfacedoc
  from timeside.analyzer.core import Analyzer
  from timeside.api import IAnalyzer
--# from timeside.analyzer import IRITSpeech4Hz
  from yaafe import Yaafe
  import yaafelib
- from timeside.analyzer import LimsiSad
+ from timeside.analyzer.limsi_sad import LimsiSad
  import numpy as N
  import sys
  
@@@ -116,85 -117,119 +115,83 @@@ class LimsiDiarization(Analyzer)
          return frames, eod
  
      def post_process(self):
-         
 +        # extract mfcc with yaafe and store them to be used with pyannote
-         mfcc = self.process_pipe.results['yaafe.mfccchop']['data_object']['value']
-         sw = YaafeFrame(self.input_blocksize, self.input_stepsize, self.input_samplerate)
-         pyannotefeat = SlidingWindowFeature(mfcc, sw)     
+         mfcc = self.process_pipe.results.get_result_by_id('yaafe.mfccchop')['data_object']['value']
  
+         sw = YaafeFrame(self.input_blocksize, self.input_stepsize, self.input_samplerate)
+         pyannotefeat = SlidingWindowFeature(mfcc, sw)
  
 -        # speech activity detection: usefull for debugging purpose only
 -        # print 'adding sad res to result'
 -        # sadres = self.new_result(data_mode='value', time_mode='framewise')
 -        # sadres.id_metadata.id += '.' + 'sadres'
 -        # sadres.id_metadata.name += ' ' + 'SAD RESULT'
 -        sadval = self.process_pipe.results.get_result_by_id(self.sad_analyzer.id() + '.sad_lhh_diff').data_object.value[:]
 -        # sadres.data_object.value = sadval
 -        # self.process_pipe.results.add(sadres)
 -
          # gaussian divergence window size
 -        #winframesize = int((self.input_samplerate / self.input_stepsize) * self.chop_window_size)
          timestepsize = self.input_stepsize / float(self.input_samplerate)
          gdiff_win_size_frame = int(self.gdiff_win_size_sec / timestepsize)
          min_seg_size_frame = int(self.min_seg_size_sec / timestepsize)
 -        # print 'timestepsize %d, gdiffwinsize (sec: %f, frame: %d) , minsegsize (sec: %f, frame: %d)' % (timestepsize, self.gdiff_win_size_sec, gdiff_win_size_frame, self.min_seg_size_sec, min_seg_size_frame)
 -
 -
 -        # basic gauss div
 -        #bgd = (range(0, len(mfcc)), 'basicgdiv')
 -        # speech gauss div
 -        sgd = ([i for i, val in enumerate(sadval) if val > 0], 'speechgdiv')
 -        # speech above threshold
 -        #thres = N.percentile([val for val in sadval if val > 0], 25)
 -        #sat = ([i for i, val in enumerate(sadval) if val > thres], 'speechthreshold25p')
 -
 -#        for frameids, name in [bgd, sgd, sat]:
 -        for frameids, name in [sgd]:
 -
 -            gdiff = gauss_div(mfcc[frameids,:], gdiff_win_size_frame)
 -
 -            # debug purpose only
 -            # res = self.new_result(data_mode='value', time_mode='event')
 -            # res.id_metadata.id += ('.%s' % name)
 -            # res.id_metadata.name += (' %s' % name)
 -            # res.data_object.value = N.array(gdiff)
 -            # res.data_object.time = N.array(frameids[gdiff_win_size_frame:(1-gdiff_win_size_frame)]) * timestepsize
 -            # self.process_pipe.results.add(res)
 -
 -            seg = segment(gdiff, min_seg_size_frame)
 -            # seg_result = self.new_result(data_mode='value', time_mode='event')
 -            # seg_result.id_metadata.id += '.' + name + 'segchop'
 -            # seg_result.id_metadata.name += ' ' + name + 'seg chop'
 -            # seg_result.data_object.value = N.array(seg)
 -            # seg_result.data_object.time = N.array(frameids[gdiff_win_size_frame:(1-gdiff_win_size_frame)]) * timestepsize
 -            # self.process_pipe.results.add(seg_result)
 -
 -            # build pyannote annotation
 -            # print 'building annotation'
 -            #b = time.time()
 -            chunks = Annotation()
 -            fbegin = None
 -            #fend = None
 -            lastframe = None
 -            ichunk = 0
 -            for segval, iframe in zip(seg, frameids):
 -                if segval == 1:
 -                    if lastframe is not None:
 -                        chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, iframe-fbegin)] = str(ichunk)
 -                        ichunk += 1
 -                    fbegin= iframe
 -                elif iframe -1 != lastframe:
 -                    if lastframe is not None:
 -                        chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
 -                    fbegin= iframe
 -                lastframe = iframe
 -            if lastframe != fbegin:
 -                chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
 -            # print 'chunks', chunks
 -            #print 'anotation build in', (time.time() - b)
 -
 -            bicClustering = BICClustering(covariance_type='full', penalty_coef=self.bic_penalty_coeff)
 -            hypothesis = bicClustering(chunks, feature=pyannotefeat)
 -
 -            # gen result interval
 -            #print 'gen result interval'
 -            diar_res = self.new_result(data_mode='label', time_mode='segment')
 -            diar_res.id_metadata.id += '.' + 'speakers' # + name + 'diarisation'
 -            diar_res.id_metadata.name += ' ' + 'speaker identifiers' # name + 'diarisation'
 -
 -            tmplabel = [int(h[2]) for h in hypothesis.itertracks(True)]
 -            tmptime = [h[0].start for h in hypothesis.itertracks()]
 -            tmpduration = [h[0].duration for h in hypothesis.itertracks()]
 -
 -            label = []
 -            time = []
 -            duration = []
 -            lastlabel = None
 -
 -            for l, t, d in zip(tmplabel, tmptime, tmpduration):
 -                if l != lastlabel:
 -                    label.append(l)
 -                    duration.append(d)
 -                    time.append(t)
 -                else:
 -                    duration[-1] = t + d - time[-1]
 -                lastlabel = l
 -
 -
 -            diar_res.data_object.label = label
 -            diar_res.data_object.time = time
 -            diar_res.data_object.duration = duration
 -            diar_res.data_object.label_metadata.label = dict()
 -            for lab in diar_res.data_object.label:
 -                diar_res.data_object.label_metadata.label[lab] = str(lab)
 -            # TODO FIXME
 -            # for h in hypothesis.itertracks(label=True):
 -            #     diar_res.data_object.label.append(h[2])
 -            #     diar_res.data_object.time.append(h[0].start)
 -            #     diar_res.data_object.duration.apeend(h[0].duration)
 -            #sadres.data_object.value = sadval
 -            self.process_pipe.results.add(diar_res)
 +
 +        # speech activity detection
-         sadval = self.process_pipe.results[self.sad_analyzer.id() + '.sad_lhh_diff'].data_object.value[:]
++        sadval = self.process_pipe.results.get_result_by_id(self.sad_analyzer.id() + '.sad_lhh_diff').data_object.value[:]
 +        # indices of frames detected as speech
 +        speech_threshold = 0.
 +        frameids = [i for i, val in enumerate(sadval) if val > speech_threshold]
 +
 +        # compute gaussian divergence of speech frames only
 +        gdiff = gauss_div(mfcc[frameids, :], gdiff_win_size_frame)
 +
 +        # initial segmentation based on gaussian divergence criterion
 +        seg = segment(gdiff, min_seg_size_frame)
 +
 +        # Convert initial segmentation to pyannote annotation
 +        chunks = Annotation()
 +        fbegin = None
 +
 +        lastframe = None
 +        ichunk = 0
 +        for segval, iframe in zip(seg, frameids):
 +            if segval == 1:
 +                if lastframe is not None:
 +                    chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, iframe-fbegin)] = str(ichunk)
 +                    ichunk += 1
 +                fbegin = iframe
 +            elif iframe - 1 != lastframe:
 +                if lastframe is not None:
 +                    chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
 +                fbegin = iframe
 +            lastframe = iframe
 +        if lastframe != fbegin:
 +            chunks[pyannotefeat.sliding_window.rangeToSegment(fbegin, lastframe-fbegin+1)] = str(ichunk)
 +
 +
 +        # performs BIC clustering
 +        bicClustering = BICClustering(covariance_type='full', penalty_coef=self.bic_penalty_coeff)
 +        hypothesis = bicClustering(chunks, feature=pyannotefeat)
 +
 +        # get diarisation results
 +        tmplabel = [int(h[2]) for h in hypothesis.itertracks(True)]
 +        tmptime = [h[0].start for h in hypothesis.itertracks()]
 +        tmpduration = [h[0].duration for h in hypothesis.itertracks()]
 +
 +        # merge adjacent clusters having the same label
 +        label = []
 +        time = []
 +        duration = []
 +        lastlabel = None
 +        for l, t, d in zip(tmplabel, tmptime, tmpduration):
 +            if l != lastlabel:
 +                label.append(l)
 +                duration.append(d)
 +                time.append(t)
 +            else:
 +                duration[-1] = t + d - time[-1]
 +            lastlabel = l
 +
 +
 +        # store diarisation result
 +        diar_res = self.new_result(data_mode='label', time_mode='segment')
 +        diar_res.id_metadata.id += '.' + 'speakers'
 +        diar_res.id_metadata.name += ' ' + 'speaker identifiers'
 +        diar_res.data_object.label = label
 +        diar_res.data_object.time = time
 +        diar_res.data_object.duration = duration
 +        diar_res.label_metadata.label = dict()
 +        for lab in diar_res.data_object.label:
 +            diar_res.label_metadata.label[lab] = str(lab)
 +
 +        self.process_pipe.results.add(diar_res)
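For context, a minimal usage sketch of the merged LimsiDiarization analyzer. This is illustrative only: it assumes the TimeSide pipe API of this era (timeside.decoder.FileDecoder, the | pipe operator, get_result_by_id), a default LimsiDiarization constructor and a placeholder audio path; only the '.speakers' result id is taken from the code above.

    import timeside
    from timeside.analyzer.limsi_diarization import LimsiDiarization

    # placeholder input file; any audio format handled by the decoder will do
    decoder = timeside.decoder.FileDecoder('/path/to/audio.wav')
    diarizer = LimsiDiarization()

    # running the pipe calls post_process(), which stores the
    # '<analyzer id>.speakers' labelled segments in the pipe results
    (decoder | diarizer).run()

    res = diarizer.process_pipe.results.get_result_by_id(
        diarizer.id() + '.speakers')
    for t, d, lab in zip(res.data_object.time,
                         res.data_object.duration,
                         res.data_object.label):
        print '%.2f s -> %.2f s : speaker %s' % (t, t + d, lab)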
diff --cc timeside/analyzer/limsi_sad.py
index 4d58a84102a116b9026f40199b71f3c293abebac,ba18b83dfba5f1e52e06a127930adc060a894b04..249c785edf23d3405ecf55b3bbf2dcaf988b3cc8
@@@ -105,35 -66,10 +105,34 @@@ class LimsiSad(Analyzer)
          """
          Parameters:
          ----------
 -        sad_model : string bellowing to 'etape' 'maya'
 -        alllows the selection of a SAD model:
 -        'etape' is more suited to radionews material
 -        'maya' is more suited to speech obtained in noisy environments
 +
 +        sad_model : string belonging to ['etape', 'maya']
 +          Allows the selection of trained speech activity detection models.
 +          * 'etape' models were trained on data distributed in the framework of the
 +            ETAPE campaign (http://www.afcp-parole.org/etape.html)
 +            These models are suited to radio news material (0.974 AUC on ETAPE data)
 +          * 'maya' models were obtained on data collected by EREA – Centre
 +            Enseignement et Recherche en Ethnologie Amerindienne
 +            These models are suited to speech obtained in noisy environments
 +            (0.915 AUC on Maya data)
 +
 +
 +        dews: dilation and erosion window size (seconds)
 +          This value corresponds to the size, in seconds, of the sliding window
 +          used to perform a dilation followed by an erosion procedure.
 +          These procedures output the max (respectively the min) of the
 +          speech detection estimate. The order of these procedures is aimed at removing
 +          non-speech frames corresponding to fricatives or short pauses.
 +          The size of the window corresponds to the minimal size of the resulting
 +          speech/non-speech segments.
 +
 +        speech_threshold: threshold used for the speech/non-speech decision
 +          based on the log likelihood difference
 +
 +        dllh_bounds: raw log likelihood difference estimates will be bounded
 +          according to this (min_llh_difference, max_llh_difference) tuple.
 +          Useful for plotting log likelihood differences.
 +          If set to None, no bounding is done.
          """
          super(LimsiSad, self).__init__()
  
  
          # load gmm model
          if sad_model not in ['etape', 'maya']:
-             raise ValueError("argument sad_model %s not supported. Supported values are 'etape' or 'maya'" % sad_model)
-         picfname = os.path.join(timeside.__path__[0], 'trained_models', 'limsi_sad_%s.pkl' % sad_model)
+             raise ValueError(
+                 "argument sad_model %s not supported. Supported values are 'etape' or 'maya'" % sad_model)
+         picfname = os.path.join(
+             timeside.__path__[0], 'analyzer', 'trained_models', 'limsi_sad_%s.pkl' % sad_model)
          self.gmms = pickle.load(open(picfname, 'rb'))
  
 +        self.dews = dews
 +        self.speech_threshold = speech_threshold
 +        self.dllh_bounds = dllh_bounds
 +
      @staticmethod
      @interfacedoc
      def id():
          return frames, eod
  
      def post_process(self):
-         mfcc = self.process_pipe.results['yaafe.mfcc']['data_object']['value']
-         mfccd1 = self.process_pipe.results['yaafe.mfccd1']['data_object']['value']
-         mfccd2 = self.process_pipe.results['yaafe.mfccd2']['data_object']['value']
-         zcr = self.process_pipe.results['yaafe.zcr']['data_object']['value']       
 +        # extract signal features
 -
 -        features = np.concatenate((mfcc, mfccd1, mfccd2, zcr), axis=1)
 -
 -        res = 0.5 + 0.5 * \
 -            (self.gmms[0].llh(features) - self.gmms[1].llh(features))
 -
+         yaafe_result = self.process_pipe.results[self.parents['yaafe'].uuid()]
+         mfcc = yaafe_result['yaafe.mfcc']['data_object']['value']
+         mfccd1 = yaafe_result['yaafe.mfccd1']['data_object']['value']
+         mfccd2 = yaafe_result['yaafe.mfccd2']['data_object']['value']
+         zcr = yaafe_result['yaafe.zcr']['data_object']['value']
 +        features = N.concatenate((mfcc, mfccd1, mfccd2, zcr), axis=1)
 +
 +        # compute log likelihood difference
 +        res = 0.5 + 0.5 * (self.gmms[0].llh(features) - self.gmms[1].llh(features))
 +
 +        # bound the log likelihood difference
 +        if self.dllh_bounds is not None:
 +            mindiff, maxdiff = self.dllh_bounds
 +            res = N.minimum(N.maximum(res, mindiff), maxdiff)
 +
 +        # perform dilation, erosion, erosion, dilation
 +        ws = int(self.dews * float(self.input_samplerate) / self.input_stepsize)
 +        deed_llh = dilatation(erosion(erosion(dilatation(res, ws), ws), ws), ws)
 +
 +        # infer speech and non-speech segments from the dilated
 +        # and eroded likelihood difference estimate
 +        last = None
 +        labels = []
 +        times = []
 +        durations = []
 +        for i, val in enumerate([1 if e > self.speech_threshold else 0 for e in deed_llh]):
 +            if val != last:
 +                labels.append(val)
 +                durations.append(1)
 +                times.append(i)
 +            else:
 +                durations[-1] += 1
 +            last = val
 +        times = [(float(e) * self.input_stepsize) / self.input_samplerate for e in times]
 +        durations = [(float(e) * self.input_stepsize) / self.input_samplerate for e in durations]
 +
 +
 +        # outputs the raw frame level speech/non speech log likelihood difference
          sad_result = self.new_result(data_mode='value', time_mode='framewise')
          sad_result.id_metadata.id += '.' + 'sad_lhh_diff'
 -        sad_result.id_metadata.name += ' ' + \
 -            'Speech Activity Detection Log Likelihood Difference'
 +        sad_result.id_metadata.name += ' ' + 'Speech Activity Detection Log Likelihood Difference'
          sad_result.data_object.value = res
 -        self.add_result(sad_result)
 +        self.process_pipe.results.add(sad_result)
 +
 +        # outputs frame level speech/non speech log likelihood difference
 +        # altered with erosion and dilatation procedures
 +        sad_de_result = self.new_result(data_mode='value', time_mode='framewise')
 +        sad_de_result.id_metadata.id += '.' + 'sad_de_lhh_diff'
 +        sad_de_result.id_metadata.name += ' ' + 'Speech Activity Detection Log Likelihood Difference | dilat | erode'
 +        sad_de_result.data_object.value = deed_llh
 +        self.process_pipe.results.add(sad_de_result)
 +
 +        # outputs speech/non speech segments
 +        sad_seg_result = self.new_result(data_mode='label', time_mode='segment')
 +        sad_seg_result.id_metadata.id += '.' + 'sad_segments'
 +        sad_seg_result.id_metadata.name += ' ' + 'Speech Activity Detection Segments'
 +        sad_seg_result.data_object.label = labels
 +        sad_seg_result.data_object.time = times
 +        sad_seg_result.data_object.duration = durations
 +        sad_seg_result.label_metadata.label = {0: 'Not Speech', 1: 'Speech'}
 +
 +        self.process_pipe.results.add(sad_seg_result)
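As a side note, a small NumPy sketch of what the dilation/erosion pass above is assumed to compute: a sliding-window maximum and minimum over the frame-level log likelihood difference. The dilatation and erosion functions below are stand-ins for the helpers the analyzer imports, and the dews/samplerate/stepsize values are placeholders, not the module's defaults.

    import numpy as np

    def dilatation(x, ws):
        # sliding-window maximum: fills short non-speech gaps (e.g. short pauses)
        return np.array([x[max(0, i - ws):i + ws + 1].max() for i in range(len(x))])

    def erosion(x, ws):
        # sliding-window minimum: drops isolated speech spikes shorter than the window
        return np.array([x[max(0, i - ws):i + ws + 1].min() for i in range(len(x))])

    # convert the window size from seconds to analysis frames (placeholder values)
    dews, samplerate, stepsize = 0.2, 16000, 160
    ws = int(dews * float(samplerate) / stepsize)

    llh_diff = np.random.randn(1000)   # stand-in for the GMM log likelihood difference
    deed_llh = dilatation(erosion(erosion(dilatation(llh_diff, ws), ws), ws), ws)
    speech = deed_llh > 0.             # frame-level speech / non-speech decision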