From 35209fd0ec9dd54d9fe9a98b63b66c30c7bed715 Mon Sep 17 00:00:00 2001 From: Thomas Fillon Date: Tue, 15 Oct 2013 23:42:23 +0200 Subject: [PATCH] switch IRIT analyzers to the new Analyzers structure --- timeside/analyzer/__init__.py | 4 +- timeside/analyzer/irit_speech_4hz.py | 180 +++++++++++++---------- timeside/analyzer/irit_speech_entropy.py | 64 +++++--- 3 files changed, 143 insertions(+), 105 deletions(-) diff --git a/timeside/analyzer/__init__.py b/timeside/analyzer/__init__.py index d9c7444..663ce64 100644 --- a/timeside/analyzer/__init__.py +++ b/timeside/analyzer/__init__.py @@ -11,5 +11,5 @@ from yaafe import * # TF : add Yaafe analyzer from spectrogram import Spectrogram from waveform import Waveform from vamp_plugin import VampSimpleHost -#from irit_speech_entropy import * -#from irit_speech_4hz import * +from irit_speech_entropy import IRITSpeechEntropy +from irit_speech_4hz import IRITSpeech4Hz diff --git a/timeside/analyzer/irit_speech_4hz.py b/timeside/analyzer/irit_speech_4hz.py index c3f0923..8d05487 100644 --- a/timeside/analyzer/irit_speech_4hz.py +++ b/timeside/analyzer/irit_speech_4hz.py @@ -19,51 +19,52 @@ # Author: Maxime Le Coz -from timeside.core import Processor, implements, interfacedoc, FixedSizeInputAdapter -from timeside.analyzer.core import * +from timeside.core import implements, interfacedoc +from timeside.analyzer.core import Analyzer +from timeside.analyzer.utils import melFilterBank, computeModulation +from timeside.analyzer.utils import segmentFromValues from timeside.api import IAnalyzer -from numpy import array,hamming,dot,mean +from numpy import array, hamming, dot, mean, float from numpy.fft import rfft -from scipy.signal import firwin,lfilter +from scipy.signal import firwin, lfilter -class IRITSpeech4Hz(Processor): +class IRITSpeech4Hz(Analyzer): implements(IAnalyzer) ''' Segmentor based on the analysis of the 4Hz energy modulation. Properties: - - energy4hz (list) : List of the 4Hz energy by frame for the modulation computation - - threshold (float) : Threshold for the classification Speech/NonSpeech - - frequency_center (float) : Center of the frequency range where the energy is extracted - - frequency_width (float) : Width of the frequency range where the energy is extracted - - orderFilter (int) : Order of the pass-band filter extracting the frequency range - - normalizeEnergy (boolean) : Whether the energy must be normalized or not - - nFFT (int) : Number of points for the FFT. Better if 512 <= nFFT <= 2048 - - nbFilters (int) : Length of the Mel Filter bank - - melFilter (numpy array) : Mel Filter bank - - modulLen (float) : Length (in second) of the modulation computation window + - energy4hz (list) : List of the 4Hz energy by frame for the modulation computation + - threshold (float) : Threshold for the classification Speech/NonSpeech + - frequency_center (float) : Center of the frequency range where the energy is extracted + - frequency_width (float) : Width of the frequency range where the energy is extracted + - orderFilter (int) : Order of the pass-band filter extracting the frequency range + - normalizeEnergy (boolean) : Whether the energy must be normalized or not + - nFFT (int) : Number of points for the FFT. Better if 512 <= nFFT <= 2048 + - nbFilters (int) : Length of the Mel Filter bank + - melFilter (numpy array) : Mel Filter bank + - modulLen (float) : Length (in second) of the modulation computation window ''' @interfacedoc def setup(self, channels=None, samplerate=None, blocksize=None, totalframes=None): - super(IRITSpeech4Hz, self).setup(channels, samplerate, blocksize, totalframes) + super(IRITSpeech4Hz, self).setup( + channels, samplerate, blocksize, totalframes) self.energy4hz = [] - print "top" # Classification self.threshold = 2.0 # Pass-band Filter self.frequency_center = 4.0 self.frequency_width = 0.5 - self.orderFilter=100 - + self.orderFilter = 100 self.normalizeEnergy = True - self.nFFT=2048 - self.nbFilters =30 + self.nFFT = 2048 + self.nbFilters = 30 self.modulLen = 2.0 - self.melFilter = melFilterBank(self.nbFilters,self.nFFT,samplerate); + self.melFilter = melFilterBank(self.nbFilters, self.nFFT, samplerate) @staticmethod @interfacedoc @@ -73,7 +74,7 @@ class IRITSpeech4Hz(Processor): @staticmethod @interfacedoc def name(): - return "Speech entropy (IRIT)" + return "IRIT Speech 4Hz Modulation" @staticmethod @interfacedoc @@ -84,61 +85,78 @@ class IRITSpeech4Hz(Processor): return "Speech confidences indexes" def process(self, frames, eod=False): - ''' - - ''' - - frames = frames.T[0] - # windowing of the frame (could be a changeable property) - w = frames * hamming(len(frames)); - - # Mel scale spectrum extraction - f = abs(rfft(w,n=2*self.nFFT)[0:self.nFFT]) - e = dot(f**2,self.melFilter) - - self.energy4hz.append(e) - - return frames, eod - - def results(self): - ''' - - ''' - print "Results" - # Creation of the pass-band filter - Wo = self.frequency_center/self.samplerate() ; - Wn = [ Wo-(self.frequency_width/2)/self.samplerate() , Wo+(self.frequency_width/2)/self.samplerate()]; - num = firwin(self.orderFilter, Wn,pass_zero=False); - - - # Energy on the frequency range - self.energy4hz=numpy.array(self.energy4hz) - energy = lfilter(num,1,self.energy4hz.T,0) - energy = sum(energy) - - # Normalization - if self.normalizeEnergy : - energy =energy/mean(energy) - - # Energy Modulation - frameLenModulation = int(self.modulLen*self.samplerate()/self.blocksize()) - modEnergyValue =computeModulation(energy,frameLenModulation,True) - - # Confidence Index - conf = array(modEnergyValue-self.threshold)/self.threshold - conf[conf>1] = 1 - - modEnergy = AnalyzerResult(id = "irit_4hzenergy_confidence", name = "modulation energie (IRIT)", unit = "?") - modEnergy.value = conf - convert = {False:'NonSpeech',True:'Speech'} - - segList = segmentFromValues(modEnergyValue>self.threshold) - segmentsEntropy =[] - for s in segList : - segmentsEntropy.append((numpy.float(s[0])*self.blocksize()/self.samplerate(), - numpy.float(s[1])*self.blocksize()/self.samplerate(), - convert[s[2]])) - - segs = AnalyzerResult(id="irit_4hzenergy_segments", name="seg 4Hz (IRIT)", unit="s") - segs.value = segmentsEntropy - return AnalyzerResultContainer([modEnergy,segs]) + ''' + + ''' + + frames = frames.T[0] + # windowing of the frame (could be a changeable property) + w = frames * hamming(len(frames)) + + # Mel scale spectrum extraction + f = abs(rfft(w, n=2 * self.nFFT)[0:self.nFFT]) + e = dot(f ** 2, self.melFilter) + + self.energy4hz.append(e) + + return frames, eod + + def release(self): + ''' + + ''' + # Creation of the pass-band filter + Wo = self.frequency_center / self.samplerate() + Wn = [Wo - (self.frequency_width / 2) / self.samplerate(), + Wo + (self.frequency_width / 2) / self.samplerate()] + num = firwin(self.orderFilter, Wn, pass_zero=False) + + # Energy on the frequency range + self.energy4hz = array(self.energy4hz) + energy = lfilter(num, 1, self.energy4hz.T, 0) + energy = sum(energy) + + # Normalization + if self.normalizeEnergy: + energy = energy / mean(energy) + + # Energy Modulation + frameLenModulation = int( + self.modulLen * self.samplerate() / self.blocksize()) + modEnergyValue = computeModulation(energy, frameLenModulation, True) + + # Confidence Index + conf = array(modEnergyValue - self.threshold) / self.threshold + conf[conf > 1] = 1 + + modEnergy = self.new_result(data_mode='value', time_mode='framewise') + modEnergy.id_metadata.id += '.' + 'energy_confidence' + modEnergy.id_metadata.name += ' ' + 'Energy Confidence' + + modEnergy.data_object.value = conf + + self._results.add(modEnergy) + + # Segment + convert = {False: 0, True: 1} + label = {0: 'nonSpeech', 1: 'Speech'} + + segList = segmentFromValues(modEnergyValue > self.threshold) + + segs = self.new_result(data_mode='label', time_mode='segment') + segs.id_metadata.id += '.' + 'segments' + segs.id_metadata.name += ' ' + 'Segments' + + segs.label_metadata.label = label + + segs.data_object.label = [convert[s[2]] for s in segList] + segs.data_object.time = [(float(s[0]) * self.blocksize() / + self.samplerate()) + for s in segList] + segs.data_object.duration = [(float(s[1]-s[0]) * self.blocksize() / + self.samplerate()) + for s in segList] + + self._results.add(segs) + + return diff --git a/timeside/analyzer/irit_speech_entropy.py b/timeside/analyzer/irit_speech_entropy.py index 73ff62b..bc034b4 100644 --- a/timeside/analyzer/irit_speech_entropy.py +++ b/timeside/analyzer/irit_speech_entropy.py @@ -20,18 +20,21 @@ # Author: Maxime Le Coz from timeside.core import Processor, implements, interfacedoc -from timeside.analyzer.core import * +from timeside.analyzer.core import Analyzer +from timeside.analyzer.utils import entropy, computeModulation +from timeside.analyzer.utils import segmentFromValues from timeside.api import IAnalyzer from numpy import array from scipy.ndimage.morphology import binary_opening -class IRITSpeechEntropy(Processor): +class IRITSpeechEntropy(Analyzer): implements(IAnalyzer) @interfacedoc def setup(self, channels=None, samplerate=None, blocksize=None, totalframes=None): - super(IRITSpeechEntropy, self).setup(channels, samplerate, blocksize, totalframes) + super(IRITSpeechEntropy, self).setup( + channels, samplerate, blocksize, totalframes) self.entropyValue = [] self.threshold = 0.4 self.smoothLen = 5 @@ -45,7 +48,7 @@ class IRITSpeechEntropy(Processor): @staticmethod @interfacedoc def name(): - return "Speech entropy (IRIT)" + return "IRIT Speech entropy" @staticmethod @interfacedoc @@ -59,30 +62,47 @@ class IRITSpeechEntropy(Processor): self.entropyValue.append(entropy(frames)) return frames, eod - def results(self): + def release(self): - entropyValue = numpy.array(self.entropyValue) - w = self.modulLen*self.samplerate()/self.blocksize() - modulentropy = computeModulation(entropyValue,w,False) - confEntropy= array(modulentropy-self.threshold)/self.threshold - confEntropy[confEntropy>1] = 1 + entropyValue = array(self.entropyValue) + w = self.modulLen * self.samplerate() / self.blocksize() + modulentropy = computeModulation(entropyValue, w, False) + confEntropy = array(modulentropy - self.threshold) / self.threshold + confEntropy[confEntropy > 1] = 1 - conf = AnalyzerResult(id = "irit_entropy_confidence", name = "entropy (IRIT)", unit = "?") - conf.value = confEntropy + conf = self.new_result(data_mode='value', time_mode='framewise') + conf.id_metadata.id += '.' + 'confidence' + conf.id_metadata.name += ' ' + 'Confidence' + + conf.data_object.value = confEntropy + self._results.add(conf) + + # Binary Entropy binaryEntropy = modulentropy > self.threshold - binaryEntropy = binary_opening(binaryEntropy,[1]*(self.smoothLen*2)) + binaryEntropy = binary_opening( + binaryEntropy, [1] * (self.smoothLen * 2)) - convert = {False:'NonSpeech',True:'Speech'} + convert = {False: 0, True: 1} + label = {0: 'NonSpeech', 1: 'Speech'} segList = segmentFromValues(binaryEntropy) - segmentsEntropy =[] - for s in segList : - segmentsEntropy.append((numpy.float(s[0])*self.blocksize()/self.samplerate(), - numpy.float(s[1])*self.blocksize()/self.samplerate(), - convert[s[2]])) - segs = AnalyzerResult(id="irit_entropy_segments", name="seg entropy (IRIT)", unit="s") - segs.value = segmentsEntropy - return AnalyzerResultContainer([conf, segs]) + segs = self.new_result(data_mode='label', time_mode='segment') + segs.id_metadata.id += '.' + 'segments' + segs.id_metadata.name += ' ' + 'Segments' + + segs.data_object.label = segList + + segs.data_object.label = [convert[s[2]] for s in segList] + segs.data_object.time = [(float(s[0]) * self.blocksize() / + self.samplerate()) + for s in segList] + segs.data_object.duration = [(float(s[1]-s[0]) * self.blocksize() / + self.samplerate()) + for s in segList] + + self._results.add(segs) + + return -- 2.39.5