# Automatically write known metadata
result.id_metadata.date = datetime.now().replace(
microsecond=0).isoformat(' ')
- result.id_metadata.version = timeside.__version__
+ #result.id_metadata.version = timeside.__version__
result.id_metadata.author = 'TimeSide'
result.id_metadata.id = self.id()
result.id_metadata.name = self.name()
from timeside.analyzer.utils import segmentFromValues
from timeside.analyzer import IRITDiverg
from timeside.api import IAnalyzer
-from numpy import logical_and,array, hamming, dot, mean, float, arange, nonzero
-from numpy.fft import rfft
-from scipy.signal import firwin, lfilter
+from numpy import mean, diff, arange
from timeside.analyzer.preprocessors import frames_adapter
self.parents.append(IRITDiverg())
self.wLen = 1.0
self.wStep = 0.1
- self.threshold = 20
+ self.threshold = 0.05
self.input_blocksize = 0;
- self.input_stepsize = 0;
-
+        self.input_stepsize = 0
+ self.maxSegForLength = 7
@interfacedoc
def setup(self, channels=None, samplerate=None, blocksize=None,
totalframes=None):
'''
'''
+
+ segList = self.process_pipe.results['irit_diverg.segments'].time
- segList = self.process_pipe.results['irit_diverg.segments'].time
- w = self.wLen/ 2;
+        w = self.wLen / 2
end = segList[-1]
- tLine = arange(0,end,self.wStep)
- segLen = array([0]*len(tLine))
-
- for i,t in enumerate(tLine):
- idx = nonzero(logical_and(segList>(t-w) ,segList<(t+w)))[0]
- segLen[i]= len(idx)
+        tLine = arange(w, end - w, self.wStep)
+        # The smallest ones! <> article
+        segLen = [mean(diff(getBoundariesInInterval(t - w, t + w, segList))) for t in tLine]
+
# Confidence Index
- conf = array(segLen - self.threshold) / self.threshold
- conf[conf > 1] = 1
+ conf = [(s - self.threshold) / self.threshold if s < 2*self.threshold else 1 for s in segLen]
segLenRes = self.new_result(data_mode='value', time_mode='framewise')
segLenRes.id_metadata.id += '.' + 'energy_confidence'
convert = {False: 0, True: 1}
label = {0: 'nonMusic', 1: 'Music'}
- segList = segmentFromValues(segLen > self.threshold)
+ segList = segmentFromValues([s > self.threshold for s in segLen])
        # Hint : Median filtering could improve smoothness of the result
# from scipy.signal import medfilt
# segList = segmentFromValues(medfilt(modEnergyValue > self.threshold, 31))
self.process_pipe.results.add(segs)
return
+
+
+def getBoundariesInInterval(start, stop, boundaries):
+    """Return the boundary times that fall within [start, stop]."""
+    return [t for t in boundaries if start <= t <= stop]
+
+
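For reference, a minimal standalone sketch of the statistic computed in this hunk: the mean length of the divergence segments falling inside each 1 s window. The boundary times below are toy values; getBoundariesInInterval and the window parameters mirror the patch, but none of this is part of it.

from numpy import mean, diff, arange

def getBoundariesInInterval(start, stop, boundaries):
    return [t for t in boundaries if start <= t <= stop]

# Toy divergence boundaries (seconds) and the same windowing as setup()
segList = [0.0, 0.2, 0.5, 0.9, 1.0, 1.4, 1.5, 1.9, 2.1, 2.4, 2.5]
wLen, wStep = 1.0, 0.1
w = wLen / 2
tLine = arange(w, segList[-1] - w, wStep)

# Mean segment length per window; short segments (dense boundaries) point towards music.
# Note: a window holding fewer than two boundaries would yield NaN here, as in the patch.
segLen = [mean(diff(getBoundariesInInterval(t - w, t + w, segList))) for t in tLine]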
from timeside.analyzer.utils import segmentFromValues
from timeside.analyzer import IRITDiverg
from timeside.api import IAnalyzer
-from numpy import logical_and,array, hamming, dot, mean, float, arange, nonzero
-from numpy.fft import rfft
-from scipy.signal import firwin, lfilter
+from numpy import array, mean, arange, nonzero
from timeside.analyzer.preprocessors import frames_adapter
'''
segList = self.process_pipe.results['irit_diverg.segments'].time
- w = self.wLen/ 2;
+        w = self.wLen / 2
end = segList[-1]
- tLine = arange(0,end,self.wStep)
+ tLine = arange(0, end, self.wStep)
- segLen = array([0]*len(tLine))
-
- for i,t in enumerate(tLine):
- idx = nonzero(logical_and(segList>(t-w) ,segList<(t+w)))[0]
- l = [tLine[t1]-tLine[t2] for t1,t2 in zip()]
- segLen[i]= len(idx)
+        segNB = [len(getBoundariesInInterval(t - w, t + w, segList)) for t in tLine]
# Confidence Index
- conf = array(segLen - self.threshold) / self.threshold
- conf[conf > 1] = 1
-
+ conf = [float(v - self.threshold) / float(self.threshold) if v < 2*self.threshold else 1.0 for v in segNB]
segLenRes = self.new_result(data_mode='value', time_mode='framewise')
segLenRes.id_metadata.id += '.' + 'energy_confidence'
segLenRes.id_metadata.name += ' ' + 'Energy Confidence'
- segLenRes.data_object.value = segLen
+ segLenRes.data_object.value = conf
self.process_pipe.results.add(segLenRes)
convert = {False: 0, True: 1}
label = {0: 'nonMusic', 1: 'Music'}
- segList = segmentFromValues(segLen > self.threshold)
+ segList = segmentFromValues([c > 0 for c in conf])
        # Hint : Median filtering could improve smoothness of the result
# from scipy.signal import medfilt
# segList = segmentFromValues(medfilt(modEnergyValue > self.threshold, 31))
self.process_pipe.results.add(segs)
return
+
+def getBoundariesInInterval(start, stop, boundaries):
+    """Return the boundary times that fall within [start, stop]."""
+    return [t for t in boundaries if start <= t <= stop]
+
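A small worked sketch of the count-based confidence used just above: the boundary count per window maps linearly onto [-1, 1] and saturates at twice the threshold. The threshold value is hypothetical, chosen only for illustration.

threshold = 20  # hypothetical boundary-count threshold

def confidence(v, threshold):
    # v == 0 -> -1.0, v == threshold -> 0.0, v >= 2 * threshold -> 1.0
    return float(v - threshold) / float(threshold) if v < 2 * threshold else 1.0

values = [0, 10, 20, 30, 40, 55]
conf = [confidence(v, threshold) for v in values]
# conf -> [-1.0, -0.5, 0.0, 0.5, 1.0, 1.0]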
--- /dev/null
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2013 Maxime Le Coz <lecoz@irit.fr>
+
+# This file is part of TimeSide.
+
+# TimeSide is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+
+# TimeSide is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with TimeSide. If not, see <http://www.gnu.org/licenses/>.
+
+# Author: Maxime Le Coz <lecoz@irit.fr>
+
+from timeside.core import implements, interfacedoc
+from timeside.analyzer.core import Analyzer
+from timeside.analyzer.utils import melFilterBank, computeModulation
+from timeside.analyzer.utils import segmentFromValues
+from timeside.api import IAnalyzer
+from numpy import array, hamming, dot, mean
+from numpy.fft import rfft
+from scipy.signal import firwin, lfilter
+from timeside.analyzer.preprocessors import frames_adapter
+
+
+class IRITSpeech4Hz(Analyzer):
+ '''Speech Segmentor based on the 4Hz energy modulation analysis.
+
+ Properties:
+ - energy4hz (list) : List of the 4Hz energy by frame for the modulation computation
+ - threshold (float) : Threshold for the classification Speech/NonSpeech
+ - frequency_center (float) : Center of the frequency range where the energy is extracted
+ - frequency_width (float) : Width of the frequency range where the energy is extracted
+ - orderFilter (int) : Order of the pass-band filter extracting the frequency range
+ - normalizeEnergy (boolean) : Whether the energy must be normalized or not
+ - nFFT (int) : Number of points for the FFT. Better if 512 <= nFFT <= 2048
+ - nbFilters (int) : Length of the Mel Filter bank
+ - melFilter (numpy array) : Mel Filter bank
+ - modulLen (float) : Length (in second) of the modulation computation window
+ '''
+
+    implements(IAnalyzer)
+
+    @interfacedoc
+ def setup(self, channels=None, samplerate=None, blocksize=None,
+ totalframes=None):
+ super(IRITSpeech4Hz, self).setup(
+ channels, samplerate, blocksize, totalframes)
+
+ self.energy4hz = []
+ # Classification
+ self.threshold = 2.0
+
+ self.wLen = 1.0
+ self.wStep = 0.1
+ self.input_blocksize = int(self.wLen * samplerate)
+ self.input_stepsize = int(self.wStep * samplerate)
+
+ # Pass-band Filter
+ self.frequency_center = 4.0
+ self.frequency_width = 0.5
+ self.orderFilter = 100
+
+ self.normalizeEnergy = True
+ self.nFFT = 2048
+ self.nbFilters = 30
+ self.modulLen = 2.0
+ self.melFilter = melFilterBank(self.nbFilters, self.nFFT, samplerate)
+
+ @staticmethod
+ @interfacedoc
+ def id():
+ return "irit_speech_4hz"
+
+ @staticmethod
+ @interfacedoc
+ def name():
+ return "IRIT Speech 4Hz Modulation"
+
+ @staticmethod
+ @interfacedoc
+ def unit():
+ return ""
+
+ def __str__(self):
+ return "Speech confidences indexes"
+
+ @frames_adapter
+ def process(self, frames, eod=False):
+ frames = frames.T[0]
+ # windowing of the frame (could be a changeable property)
+ w = frames * hamming(len(frames))
+
+ # Mel scale spectrum extraction
+ f = abs(rfft(w, n=2 * self.nFFT)[0:self.nFFT])
+ e = dot(f ** 2, self.melFilter)
+
+ self.energy4hz.append(e)
+
+ return frames, eod
+
+ def post_process(self):
+ '''
+
+ '''
+ # Creation of the pass-band filter
+ Wo = self.frequency_center / self.samplerate()
+ Wn = [Wo - (self.frequency_width / 2) / self.samplerate(),
+ Wo + (self.frequency_width / 2) / self.samplerate()]
+ num = firwin(self.orderFilter, Wn, pass_zero=False)
+
+ # Energy on the frequency range
+ self.energy4hz = array(self.energy4hz)
+ energy = lfilter(num, 1, self.energy4hz.T, 0)
+ energy = sum(energy)
+
+ # Normalization
+ if self.normalizeEnergy:
+ energy = energy / mean(energy)
+
+ # Energy Modulation
+ frameLenModulation = int(
+ self.modulLen * self.samplerate() / self.input_blocksize)
+ modEnergyValue = computeModulation(energy, frameLenModulation, True)
+
+ # Confidence Index
+ conf = array(modEnergyValue - self.threshold) / self.threshold
+ conf[conf > 1] = 1
+
+ modEnergy = self.new_result(data_mode='value', time_mode='framewise')
+ modEnergy.id_metadata.id += '.' + 'energy_confidence'
+ modEnergy.id_metadata.name += ' ' + 'Energy Confidence'
+
+ modEnergy.data_object.value = conf
+
+ self.process_pipe.results.add(modEnergy)
+
+ # Segment
+ convert = {False: 0, True: 1}
+ label = {0: 'nonSpeech', 1: 'Speech'}
+
+ segList = segmentFromValues(modEnergyValue > self.threshold)
+        # Hint : Median filtering could improve smoothness of the result
+ # from scipy.signal import medfilt
+ # segList = segmentFromValues(medfilt(modEnergyValue > self.threshold, 31))
+
+ segs = self.new_result(data_mode='label', time_mode='segment')
+ segs.id_metadata.id += '.' + 'segments'
+ segs.id_metadata.name += ' ' + 'Segments'
+
+ segs.label_metadata.label = label
+
+ segs.data_object.label = [convert[s[2]] for s in segList]
+ segs.data_object.time = [(float(s[0]) * self.input_blocksize /
+ self.samplerate())
+ for s in segList]
+        segs.data_object.duration = [(float(s[1]-s[0]) * self.input_blocksize /
+ self.samplerate())
+ for s in segList]
+
+ self.process_pipe.results.add(segs)
+
+ return
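A minimal usage sketch for the new analyzer, assuming the classic TimeSide pipe API (timeside.decoder.FileDecoder, the | operator, ProcessPipe.run()) and that IRITSpeech4Hz is exposed under timeside.analyzer; 'voice.wav' is a placeholder path.

import timeside

decoder = timeside.decoder.FileDecoder('voice.wav')   # placeholder input file
speech4hz = timeside.analyzer.IRITSpeech4Hz()

pipe = (decoder | speech4hz)
pipe.run()

# Result ids follow id() plus the suffixes set in post_process()
confidence = pipe.results['irit_speech_4hz.energy_confidence']
segments = pipe.results['irit_speech_4hz.segments']
starts = segments.time        # segment start times, in seconds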
def post_process(self):
entropyValue = array(self.entropyValue)
+
+ import pylab
+ pylab.plot(entropyValue)
+ pylab.show()
w = self.modulLen/self.wStep
- print w,len(entropyValue)
modulentropy = computeModulation(entropyValue, w, False)
- import pylab
- pylab.plot(modulentropy)
- pylab.show()
+
confEntropy = array(modulentropy - self.threshold) / self.threshold
confEntropy[confEntropy > 1] = 1
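For reference, the modulation window above is expressed in analysis frames: with the values set in the entropy analyzer's setup (modulLen = 2 s, wStep = 0.1 s), w = modulLen / wStep = 20 frames, so the entropy modulation is measured over a two-second span of 0.1 s hops. A one-line sanity check, not part of the patch:

modulLen, wStep = 2.0, 0.1
w = modulLen / wStep    # 20.0 frames of 0.1 s hop covering the 2 s modulation window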
--- /dev/null
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2013 Maxime Le Coz <lecoz@irit.fr>
+
+# This file is part of TimeSide.
+
+# TimeSide is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+
+# TimeSide is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with TimeSide. If not, see <http://www.gnu.org/licenses/>.
+
+# Author: Maxime Le Coz <lecoz@irit.fr>
+
+from timeside.core import implements, interfacedoc
+from timeside.analyzer.core import Analyzer
+from timeside.analyzer.utils import entropy, computeModulation
+from timeside.analyzer.utils import segmentFromValues
+from timeside.api import IAnalyzer
+from numpy import array
+from scipy.ndimage.morphology import binary_opening
+from timeside.analyzer.preprocessors import frames_adapter
+
+
+class IRITSpeechEntropy(Analyzer):
+ """Speech Segmentor based on Entropy analysis."""
+
+ implements(IAnalyzer)
+
+ @interfacedoc
+ def setup(self, channels=None, samplerate=None, blocksize=None,
+ totalframes=None):
+ super(IRITSpeechEntropy, self).setup(
+ channels, samplerate, blocksize, totalframes)
+ self.entropyValue = []
+ self.threshold = 0.4
+ self.smoothLen = 5
+ self.modulLen = 2
+ self.wLen = 1.0
+ self.wStep = 0.1
+ self.input_blocksize = int(self.wLen * samplerate)
+ self.input_stepsize = int(self.wStep * samplerate)
+
+ @staticmethod
+ @interfacedoc
+ def id():
+ return "irit_speech_entropy"
+
+ @staticmethod
+ @interfacedoc
+ def name():
+ return "IRIT Speech entropy"
+
+ @staticmethod
+ @interfacedoc
+ def unit():
+ return ""
+
+ def __str__(self):
+ return "Speech confidences indexes"
+
+ @frames_adapter
+ def process(self, frames, eod=False):
+ self.entropyValue.append(entropy(frames))
+ return frames, eod
+
+ def post_process(self):
+ entropyValue = array(self.entropyValue)
+ w = self.modulLen * self.samplerate() / self.input_blocksize
+ modulentropy = computeModulation(entropyValue, w, False)
+ confEntropy = array(modulentropy - self.threshold) / self.threshold
+ confEntropy[confEntropy > 1] = 1
+
+ conf = self.new_result(data_mode='value', time_mode='framewise')
+
+ conf.id_metadata.id += '.' + 'confidence'
+ conf.id_metadata.name += ' ' + 'Confidence'
+
+ conf.data_object.value = confEntropy
+ self.process_pipe.results.add(conf)
+
+ # Binary Entropy
+ binaryEntropy = modulentropy > self.threshold
+ binaryEntropy = binary_opening(
+ binaryEntropy, [1] * (self.smoothLen * 2))
+
+ convert = {False: 0, True: 1}
+ label = {0: 'NonSpeech', 1: 'Speech'}
+ segList = segmentFromValues(binaryEntropy)
+
+ segs = self.new_result(data_mode='label', time_mode='segment')
+ segs.id_metadata.id += '.' + 'segments'
+ segs.id_metadata.name += ' ' + 'Segments'
+
+ segs.label_metadata.label = label
+
+ segs.data_object.label = [convert[s[2]] for s in segList]
+ segs.data_object.time = [(float(s[0]) * self.input_blocksize /
+ self.samplerate())
+ for s in segList]
+        segs.data_object.duration = [(float(s[1]-s[0]) * self.input_blocksize /
+ self.samplerate())
+ for s in segList]
+
+ self.process_pipe.results.add(segs)
+
+ return
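For reference, a sketch of how the segment triples returned by segmentFromValues are read in post_process() above. The (start_frame, stop_frame, value) layout is an illustrative reading of the indexing used in the code, not the util's documented contract, and the samplerate below is a placeholder.

samplerate = 22050.0
input_blocksize = 22050            # 1.0 s analysis frames at the placeholder samplerate

segList = [(0, 4, False), (5, 12, True)]   # toy output: frames 0-4 non-speech, 5-12 speech

labels = [{False: 0, True: 1}[s[2]] for s in segList]                             # [0, 1]
times = [float(s[0]) * input_blocksize / samplerate for s in segList]             # [0.0, 5.0] s
durations = [float(s[1] - s[0]) * input_blocksize / samplerate for s in segList]  # [4.0, 7.0] s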