# Author: Maxime Le Coz <lecoz@irit.fr>
-from timeside.core import Processor, implements, interfacedoc, FixedSizeInputAdapter
-from timeside.analyzer.core import *
+from timeside.core import implements, interfacedoc
+from timeside.analyzer.core import Analyzer
+from timeside.analyzer.utils import melFilterBank, computeModulation
+from timeside.analyzer.utils import segmentFromValues
from timeside.api import IAnalyzer
-from numpy import array,hamming,dot,mean
+from numpy import array, hamming, dot, mean, float
from numpy.fft import rfft
-from scipy.signal import firwin,lfilter
+from scipy.signal import firwin, lfilter
-class IRITSpeech4Hz(Processor):
+class IRITSpeech4Hz(Analyzer):
implements(IAnalyzer)
'''
Segmentor based on the analysis of the 4Hz energy modulation.
Properties:
- - energy4hz (list) : List of the 4Hz energy by frame for the modulation computation
- - threshold (float) : Threshold for the classification Speech/NonSpeech
- - frequency_center (float) : Center of the frequency range where the energy is extracted
- - frequency_width (float) : Width of the frequency range where the energy is extracted
- - orderFilter (int) : Order of the pass-band filter extracting the frequency range
- - normalizeEnergy (boolean) : Whether the energy must be normalized or not
- - nFFT (int) : Number of points for the FFT. Better if 512 <= nFFT <= 2048
- - nbFilters (int) : Length of the Mel Filter bank
- - melFilter (numpy array) : Mel Filter bank
- - modulLen (float) : Length (in second) of the modulation computation window
+ - energy4hz (list) : List of the 4Hz energy by frame for the modulation computation
+ - threshold (float) : Threshold for the classification Speech/NonSpeech
+ - frequency_center (float) : Center of the frequency range where the energy is extracted
+ - frequency_width (float) : Width of the frequency range where the energy is extracted
+ - orderFilter (int) : Order of the pass-band filter extracting the frequency range
+ - normalizeEnergy (boolean) : Whether the energy must be normalized or not
+ - nFFT (int) : Number of points for the FFT. Better if 512 <= nFFT <= 2048
+ - nbFilters (int) : Length of the Mel Filter bank
+ - melFilter (numpy array) : Mel Filter bank
+ - modulLen (float) : Length (in second) of the modulation computation window
'''
@interfacedoc
def setup(self, channels=None, samplerate=None, blocksize=None, totalframes=None):
- super(IRITSpeech4Hz, self).setup(channels, samplerate, blocksize, totalframes)
+ super(IRITSpeech4Hz, self).setup(
+ channels, samplerate, blocksize, totalframes)
+ # One Mel-band energy vector is appended here by each process() call
self.energy4hz = []
- print "top"
# Classification
self.threshold = 2.0
# Pass-band Filter
self.frequency_center = 4.0
self.frequency_width = 0.5
- self.orderFilter=100
-
+ self.orderFilter = 100
self.normalizeEnergy = True
- self.nFFT=2048
- self.nbFilters =30
+ # Spectrum size and number of Mel filters
+ self.nFFT = 2048
+ self.nbFilters = 30
self.modulLen = 2.0
- self.melFilter = melFilterBank(self.nbFilters,self.nFFT,samplerate);
+ # Filter bank built once from the settings above and the samplerate argument
+ self.melFilter = melFilterBank(self.nbFilters, self.nFFT, samplerate)
@staticmethod
@interfacedoc
@staticmethod
@interfacedoc
def name():
- return "Speech entropy (IRIT)"
+ # The previous string was the entropy analyzer's name; return this analyzer's own name
+ return "IRIT Speech 4Hz Modulation"
@staticmethod
@interfacedoc
return "Speech confidences indexes"
def process(self, frames, eod=False):
- '''
-
- '''
-
- frames = frames.T[0]
- # windowing of the frame (could be a changeable property)
- w = frames * hamming(len(frames));
-
- # Mel scale spectrum extraction
- f = abs(rfft(w,n=2*self.nFFT)[0:self.nFFT])
- e = dot(f**2,self.melFilter)
-
- self.energy4hz.append(e)
-
- return frames, eod
-
- def results(self):
- '''
-
- '''
- print "Results"
- # Creation of the pass-band filter
- Wo = self.frequency_center/self.samplerate() ;
- Wn = [ Wo-(self.frequency_width/2)/self.samplerate() , Wo+(self.frequency_width/2)/self.samplerate()];
- num = firwin(self.orderFilter, Wn,pass_zero=False);
-
-
- # Energy on the frequency range
- self.energy4hz=numpy.array(self.energy4hz)
- energy = lfilter(num,1,self.energy4hz.T,0)
- energy = sum(energy)
-
- # Normalization
- if self.normalizeEnergy :
- energy =energy/mean(energy)
-
- # Energy Modulation
- frameLenModulation = int(self.modulLen*self.samplerate()/self.blocksize())
- modEnergyValue =computeModulation(energy,frameLenModulation,True)
-
- # Confidence Index
- conf = array(modEnergyValue-self.threshold)/self.threshold
- conf[conf>1] = 1
-
- modEnergy = AnalyzerResult(id = "irit_4hzenergy_confidence", name = "modulation energie (IRIT)", unit = "?")
- modEnergy.value = conf
- convert = {False:'NonSpeech',True:'Speech'}
-
- segList = segmentFromValues(modEnergyValue>self.threshold)
- segmentsEntropy =[]
- for s in segList :
- segmentsEntropy.append((numpy.float(s[0])*self.blocksize()/self.samplerate(),
- numpy.float(s[1])*self.blocksize()/self.samplerate(),
- convert[s[2]]))
-
- segs = AnalyzerResult(id="irit_4hzenergy_segments", name="seg 4Hz (IRIT)", unit="s")
- segs.value = segmentsEntropy
- return AnalyzerResultContainer([modEnergy,segs])
+ '''
+ Analyse one block and store its Mel-band energies.
+
+ Only frames.T[0] is used. It is multiplied by a Hamming window, the
+ magnitude of the first nFFT bins of a 2*nFFT-point real FFT is
+ squared and projected on self.melFilter, and the resulting vector is
+ appended to self.energy4hz.
+
+ Returns the frames.T[0] slice (not the block that was passed in)
+ together with eod.
+ '''
+
+ frames = frames.T[0]
+ # windowing of the frame (could be a changeable property)
+ w = frames * hamming(len(frames))
+
+ # Mel scale spectrum extraction
+ f = abs(rfft(w, n=2 * self.nFFT)[0:self.nFFT])
+ e = dot(f ** 2, self.melFilter)
+
+ self.energy4hz.append(e)
+
+ return frames, eod
+
+ def release(self):
+ '''
+ Turn the energies accumulated in self.energy4hz into two results
+ added to self._results: a framewise confidence value and a list of
+ labelled segments. Nothing is returned.
+ '''
+ # Creation of the pass-band filter
+ # Band edges are frequency_center -/+ frequency_width / 2, each divided
+ # by self.samplerate().
+ # NOTE(review): one energy vector is stored per process() call, so the
+ # series being filtered presumably runs at samplerate / blocksize, and
+ # firwin cutoffs are relative to Nyquist by default - confirm that
+ # dividing by the audio samplerate is the intended normalisation.
+ Wo = self.frequency_center / self.samplerate()
+ Wn = [Wo - (self.frequency_width / 2) / self.samplerate(),
+ Wo + (self.frequency_width / 2) / self.samplerate()]
+ num = firwin(self.orderFilter, Wn, pass_zero=False)
+
+ # Energy on the frequency range
+ # The stacked energies are transposed, filtered along axis 0 (fourth
+ # positional argument of lfilter), then the rows are added together.
+ # NOTE(review): if each appended vector is 1-D, axis 0 of the transposed
+ # array is the Mel-band axis rather than time - confirm the axis.
+ # NOTE(review): sum is not imported from numpy in the visible imports,
+ # so this is presumably the builtin.
+ self.energy4hz = array(self.energy4hz)
+ energy = lfilter(num, 1, self.energy4hz.T, 0)
+ energy = sum(energy)
+
+ # Normalization
+ if self.normalizeEnergy:
+ energy = energy / mean(energy)
+
+ # Energy Modulation
+ # Window length in blocks: modulLen * samplerate / blocksize, truncated
+ frameLenModulation = int(
+ self.modulLen * self.samplerate() / self.blocksize())
+ modEnergyValue = computeModulation(energy, frameLenModulation, True)
+
+ # Confidence Index
+ # Relative distance to the threshold, capped at 1 (no lower cap)
+ conf = array(modEnergyValue - self.threshold) / self.threshold
+ conf[conf > 1] = 1
+
+ modEnergy = self.new_result(data_mode='value', time_mode='framewise')
+ modEnergy.id_metadata.id += '.' + 'energy_confidence'
+ modEnergy.id_metadata.name += ' ' + 'Energy Confidence'
+
+ modEnergy.data_object.value = conf
+
+ self._results.add(modEnergy)
+
+ # Segment
+ # convert maps the boolean decision to an integer code; label maps that
+ # code to its string.
+ # NOTE(review): the removed results() emitted 'NonSpeech' where this
+ # table has 'nonSpeech' - confirm the casing change is intended.
+ convert = {False: 0, True: 1}
+ label = {0: 'nonSpeech', 1: 'Speech'}
+
+ segList = segmentFromValues(modEnergyValue > self.threshold)
+
+ segs = self.new_result(data_mode='label', time_mode='segment')
+ segs.id_metadata.id += '.' + 'segments'
+ segs.id_metadata.name += ' ' + 'Segments'
+
+ segs.label_metadata.label = label
+
+ # s[2] is the boolean decision; s[0] and s[1] are scaled by
+ # blocksize / samplerate to give the start time and the duration.
+ # NOTE(review): float is the name imported from numpy at the top of the
+ # file; numpy.float was an alias of the builtin and is gone from recent
+ # NumPy releases - consider relying on the builtin instead.
+ segs.data_object.label = [convert[s[2]] for s in segList]
+ segs.data_object.time = [(float(s[0]) * self.blocksize() /
+ self.samplerate())
+ for s in segList]
+ segs.data_object.duration = [(float(s[1]-s[0]) * self.blocksize() /
+ self.samplerate())
+ for s in segList]
+
+ self._results.add(segs)
+
+ return
# Author: Maxime Le Coz <lecoz@irit.fr>
from timeside.core import Processor, implements, interfacedoc
-from timeside.analyzer.core import *
+from timeside.analyzer.core import Analyzer
+from timeside.analyzer.utils import entropy, computeModulation
+from timeside.analyzer.utils import segmentFromValues
from timeside.api import IAnalyzer
from numpy import array
from scipy.ndimage.morphology import binary_opening
-class IRITSpeechEntropy(Processor):
+class IRITSpeechEntropy(Analyzer):
implements(IAnalyzer)
@interfacedoc
def setup(self, channels=None, samplerate=None, blocksize=None, totalframes=None):
- super(IRITSpeechEntropy, self).setup(channels, samplerate, blocksize, totalframes)
+ super(IRITSpeechEntropy, self).setup(
+ channels, samplerate, blocksize, totalframes)
+ # entropyValue receives one entropy value per processed block; threshold
+ # and smoothLen drive the speech decision and its smoothing in release().
+ # NOTE(review): release() also reads self.modulLen, which is not assigned
+ # in the lines visible here - confirm it is set.
self.entropyValue = []
self.threshold = 0.4
self.smoothLen = 5
@staticmethod
@interfacedoc
def name():
- return "Speech entropy (IRIT)"
+ # The new name puts the "IRIT" prefix first, like the 4Hz analyzer's name
+ return "IRIT Speech entropy"
@staticmethod
@interfacedoc
self.entropyValue.append(entropy(frames))
return frames, eod
- def results(self):
+ def release(self):
+ # Build two results from the accumulated entropy values and add them to
+ # self._results: a framewise confidence value and a list of labelled
+ # segments. Nothing is returned.
- entropyValue = numpy.array(self.entropyValue)
- w = self.modulLen*self.samplerate()/self.blocksize()
- modulentropy = computeModulation(entropyValue,w,False)
- confEntropy= array(modulentropy-self.threshold)/self.threshold
- confEntropy[confEntropy>1] = 1
+ entropyValue = array(self.entropyValue)
+ # Modulation window length in blocks: modulLen * samplerate / blocksize
+ # NOTE(review): unlike the 4Hz analyzer this is not wrapped in int() -
+ # confirm computeModulation accepts a non-integer length.
+ w = self.modulLen * self.samplerate() / self.blocksize()
+ modulentropy = computeModulation(entropyValue, w, False)
+ # Relative distance to the threshold, capped at 1 (no lower cap)
+ confEntropy = array(modulentropy - self.threshold) / self.threshold
+ confEntropy[confEntropy > 1] = 1
- conf = AnalyzerResult(id = "irit_entropy_confidence", name = "entropy (IRIT)", unit = "?")
- conf.value = confEntropy
+ conf = self.new_result(data_mode='value', time_mode='framewise')
+ conf.id_metadata.id += '.' + 'confidence'
+ conf.id_metadata.name += ' ' + 'Confidence'
+
+ conf.data_object.value = confEntropy
+ self._results.add(conf)
+
+ # Binary Entropy
binaryEntropy = modulentropy > self.threshold
- binaryEntropy = binary_opening(binaryEntropy,[1]*(self.smoothLen*2))
+ binaryEntropy = binary_opening(
+ binaryEntropy, [1] * (self.smoothLen * 2))
- convert = {False:'NonSpeech',True:'Speech'}
+ convert = {False: 0, True: 1}
+ label = {0: 'NonSpeech', 1: 'Speech'}
segList = segmentFromValues(binaryEntropy)
- segmentsEntropy =[]
- for s in segList :
- segmentsEntropy.append((numpy.float(s[0])*self.blocksize()/self.samplerate(),
- numpy.float(s[1])*self.blocksize()/self.samplerate(),
- convert[s[2]]))
- segs = AnalyzerResult(id="irit_entropy_segments", name="seg entropy (IRIT)", unit="s")
- segs.value = segmentsEntropy
- return AnalyzerResultContainer([conf, segs])
+ segs = self.new_result(data_mode='label', time_mode='segment')
+ segs.id_metadata.id += '.' + 'segments'
+ segs.id_metadata.name += ' ' + 'Segments'
+
+ # Publish the code -> name table with the result, as the 4Hz analyzer
+ # does. This line used to assign segList to data_object.label, which the
+ # next statement overwrote, leaving the label table unused.
+ segs.label_metadata.label = label
+
+ # s[2] is the boolean decision; s[0] and s[1] are scaled by
+ # blocksize / samplerate to give the start time and the duration.
+ segs.data_object.label = [convert[s[2]] for s in segList]
+ segs.data_object.time = [(float(s[0]) * self.blocksize() /
+ self.samplerate())
+ for s in segList]
+ segs.data_object.duration = [(float(s[1]-s[0]) * self.blocksize() /
+ self.samplerate())
+ for s in segList]
+
+ self._results.add(segs)
+
+ return