git.parisson.com Git - timeside.git/commitdiff
Optimised music detector
authorMaxime LE COZ <lecoz@irit.fr>
Tue, 11 Mar 2014 15:26:56 +0000 (16:26 +0100)
committerMaxime LE COZ <lecoz@irit.fr>
Tue, 11 Mar 2014 15:26:56 +0000 (16:26 +0100)
timeside/analyzer/core.py
timeside/analyzer/irit_music_SLN.py
timeside/analyzer/irit_music_SNB.py
timeside/analyzer/irit_speech_4hz.py.orig [new file with mode: 0644]
timeside/analyzer/irit_speech_entropy.py
timeside/analyzer/irit_speech_entropy.py.orig [new file with mode: 0644]
timeside/analyzer/startShapes.dat [new file with mode: 0644]

index f0e649f3cff1bbdbc8dd67531f963d5f4450e14d..9424f6767d1da9abbb2709f2eda3c182509ac443 100644 (file)
@@ -1072,7 +1072,7 @@ class Analyzer(Processor):
         # Automatically write known metadata
         result.id_metadata.date = datetime.now().replace(
             microsecond=0).isoformat(' ')
-        result.id_metadata.version = timeside.__version__
+        #result.id_metadata.version = timeside.__version__
         result.id_metadata.author = 'TimeSide'
         result.id_metadata.id = self.id()
         result.id_metadata.name = self.name()
index 814cc831942ebfc714ef005868ed1bb3b7c19032..ab1e7141952e1eaa1d810be592ab8f0e7db6c2aa 100644 (file)
@@ -25,9 +25,7 @@ from timeside.analyzer.utils import melFilterBank, computeModulation
 from timeside.analyzer.utils import segmentFromValues
 from timeside.analyzer import IRITDiverg
 from timeside.api import IAnalyzer
-from numpy import logical_and,array, hamming, dot, mean, float, arange, nonzero
-from numpy.fft import rfft
-from scipy.signal import firwin, lfilter
+from numpy import mean, diff, arange
 from timeside.analyzer.preprocessors import frames_adapter
 
 
@@ -40,10 +38,10 @@ class IRITMusicSLN(Analyzer):
         self.parents.append(IRITDiverg())
         self.wLen      = 1.0
         self.wStep     = 0.1
-        self.threshold = 20
+        self.threshold = 0.05
         self.input_blocksize = 0;
-        self.input_stepsize = 0;      
-
+        self.input_stepsize = 0;
+        self.maxSegForLength = 7
     @interfacedoc
     def setup(self, channels=None, samplerate=None, blocksize=None,
               totalframes=None):
@@ -78,21 +76,19 @@ class IRITMusicSLN(Analyzer):
         '''
 
         '''
+
+        segList = self.process_pipe.results['irit_diverg.segments'].time
         
-        segList = self.process_pipe.results['irit_diverg.segments'].time  
-        w = self.wLen/ 2;
+        w = self.wLen/ 2
         end = segList[-1]
-        tLine =  arange(0,end,self.wStep)
         
-        segLen = array([0]*len(tLine))
-        
-        for i,t in enumerate(tLine):
-            idx = nonzero(logical_and(segList>(t-w) ,segList<(t+w)))[0]
-            segLen[i]= len(idx)
+        tLine = arange(w,end-w,self.wStep)
         
+        #  The smallest ones!  <> paper
+        segLen         = [mean(diff(getBoundariesInInterval(t-w, t+w, segList))) for t in tLine]
+
                # Confidence Index
-        conf = array(segLen - self.threshold) / self.threshold
-        conf[conf > 1] = 1
+        conf = [(s - self.threshold) / self.threshold if s < 2*self.threshold else 1 for s in segLen]
 
         segLenRes = self.new_result(data_mode='value', time_mode='framewise')
         segLenRes.id_metadata.id += '.' + 'energy_confidence'
@@ -106,7 +102,7 @@ class IRITMusicSLN(Analyzer):
         convert = {False: 0, True: 1}
         label = {0: 'nonMusic', 1: 'Music'}
 
-        segList = segmentFromValues(segLen > self.threshold)
+        segList = segmentFromValues([s > self.threshold for s in segLen])
         # Hint: Median filtering could improve smoothness of the result
         # from scipy.signal import medfilt
         # segList = segmentFromValues(medfilt(modEnergyValue > self.threshold, 31))
@@ -123,3 +119,9 @@ class IRITMusicSLN(Analyzer):
 
         self.process_pipe.results.add(segs)
         return
+
+
+def getBoundariesInInterval(start, stop, boundaries):
+    return [t for t in boundaries if t >= start and t <= stop]
+
+
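For readers following the irit_music_SLN.py change above: each analysis window is now scored by the mean gap between the divergence boundaries it contains rather than by their count, and the confidence is clipped to 1 once that mean length reaches twice the threshold. A minimal standalone sketch of the new computation (boundary times and parameter values below are illustrative, not taken from a real run):

from numpy import arange, diff, mean

def getBoundariesInInterval(start, stop, boundaries):
    # Divergence boundaries (in seconds) falling inside [start, stop]
    return [t for t in boundaries if start <= t <= stop]

# Illustrative irit_diverg boundaries (seconds) and SLN parameters
segList = [0.0, 0.12, 0.19, 0.31, 1.4, 2.9, 3.05, 3.1, 3.2]
wLen, wStep, threshold = 1.0, 0.1, 0.05

w = wLen / 2
tLine = arange(w, segList[-1] - w, wStep)

# Mean segment length inside each window; a window holding fewer than two
# boundaries yields an empty diff(), so mean() returns NaN there -- worth
# guarding against in production code.
segLen = [mean(diff(getBoundariesInInterval(t - w, t + w, segList))) for t in tLine]

# Clipped confidence, as in the diff above
conf = [(s - threshold) / threshold if s < 2 * threshold else 1 for s in segLen]

Windows whose mean segment length stays above the threshold are then labelled 'Music' by the segmentFromValues call further down in the same method.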
index 900b72e124bb55e06633efc663f661058a6f05c0..98caee3ae9bb23ae6b8e2d3d4f4eb4d83fa5e726 100644 (file)
@@ -25,9 +25,7 @@ from timeside.analyzer.utils import melFilterBank, computeModulation
 from timeside.analyzer.utils import segmentFromValues
 from timeside.analyzer import IRITDiverg
 from timeside.api import IAnalyzer
-from numpy import logical_and,array, hamming, dot, mean, float, arange, nonzero
-from numpy.fft import rfft
-from scipy.signal import firwin, lfilter
+from numpy import array, mean, arange, nonzero
 from timeside.analyzer.preprocessors import frames_adapter
 
 
@@ -80,26 +78,19 @@ class IRITMusicSNB(Analyzer):
         '''
         
         segList = self.process_pipe.results['irit_diverg.segments'].time  
-        w = self.wLen/ 2;
+        w = self.wLen/ 2
         end = segList[-1]
-        tLine =  arange(0,end,self.wStep)
+        tLine = arange(0, end, self.wStep)
         
-        segLen = array([0]*len(tLine))
-        
-        for i,t in enumerate(tLine):
-            idx = nonzero(logical_and(segList>(t-w) ,segList<(t+w)))[0]
-            l = [tLine[t1]-tLine[t2] for t1,t2 in zip()]
-            segLen[i]= len(idx)
+        segNB = [ len(getBoundariesInInterval(t-w,t+w,segList)) for t in tLine ]
         
        # Confidence Index
-        conf = array(segLen - self.threshold) / self.threshold
-        conf[conf > 1] = 1
-
+        conf = [float(v - self.threshold) / float(self.threshold) if v < 2*self.threshold else 1.0 for v in segNB]
         segLenRes = self.new_result(data_mode='value', time_mode='framewise')
         segLenRes.id_metadata.id += '.' + 'energy_confidence'
         segLenRes.id_metadata.name += ' ' + 'Energy Confidence'
 
-        segLenRes.data_object.value = segLen
+        segLenRes.data_object.value = conf
 
         self.process_pipe.results.add(segLenRes)
 
@@ -107,7 +98,7 @@ class IRITMusicSNB(Analyzer):
         convert = {False: 0, True: 1}
         label = {0: 'nonMusic', 1: 'Music'}
 
-        segList = segmentFromValues(segLen > self.threshold)
+        segList = segmentFromValues([c > 0 for c in conf])
         # Hint: Median filtering could improve smoothness of the result
         # from scipy.signal import medfilt
         # segList = segmentFromValues(medfilt(modEnergyValue > self.threshold, 31))
@@ -124,3 +115,7 @@ class IRITMusicSNB(Analyzer):
 
         self.process_pipe.results.add(segs)
         return
+
+def getBoundariesInInterval(start, stop, boundaries):
+    return [t for t in boundaries if t >= start and t <= stop]
+
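The SNB variant keeps the same getBoundariesInInterval helper but scores each window by how many boundaries it contains; the clipped confidence has the same shape. A short hedged sketch (boundary times are made up, and the threshold value is an assumption since this diff does not show SNB's threshold):

from numpy import arange

def getBoundariesInInterval(start, stop, boundaries):
    return [t for t in boundaries if start <= t <= stop]

segList = [0.0, 0.12, 0.19, 0.31, 1.4, 2.9, 3.05, 3.1, 3.2]
wLen, wStep, threshold = 1.0, 0.1, 20  # threshold is illustrative only

w = wLen / 2
tLine = arange(0, segList[-1], wStep)

# Number of divergence boundaries per window
segNB = [len(getBoundariesInInterval(t - w, t + w, segList)) for t in tLine]

# Same clipped confidence as SLN, driven by the count instead of the mean length
conf = [float(v - threshold) / float(threshold) if v < 2 * threshold else 1.0
        for v in segNB]

segmentFromValues([c > 0 for c in conf]) then marks windows whose boundary count exceeds the threshold as 'Music'.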
diff --git a/timeside/analyzer/irit_speech_4hz.py.orig b/timeside/analyzer/irit_speech_4hz.py.orig
new file mode 100644 (file)
index 0000000..6acac1c
--- /dev/null
@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2013 Maxime Le Coz <lecoz@irit.fr>
+
+# This file is part of TimeSide.
+
+# TimeSide is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+
+# TimeSide is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with TimeSide.  If not, see <http://www.gnu.org/licenses/>.
+
+# Author: Maxime Le Coz <lecoz@irit.fr>
+
+from timeside.core import implements, interfacedoc
+from timeside.analyzer.core import Analyzer
+from timeside.analyzer.utils import melFilterBank, computeModulation
+from timeside.analyzer.utils import segmentFromValues
+from timeside.api import IAnalyzer
+from numpy import array, hamming, dot, mean, float
+from numpy.fft import rfft
+from scipy.signal import firwin, lfilter
+from timeside.analyzer.preprocessors import frames_adapter
+
+
+class IRITSpeech4Hz(Analyzer):
+    '''Speech Segmentor based on the 4Hz energy modulation analysis.
+
+    Properties:
+        - energy4hz            (list)          : List of the 4Hz energy by frame for the modulation computation
+        - threshold            (float)         : Threshold for the classification Speech/NonSpeech
+        - frequency_center     (float)         : Center of the frequency range where the energy is extracted
+        - frequency_width      (float)         : Width of the frequency range where the energy is extracted
+        - orderFilter          (int)           : Order of the pass-band filter extracting the frequency range
+        - normalizeEnergy      (boolean)       : Whether the energy must be normalized or not
+        - nFFT                 (int)           : Number of points for the FFT. Better if 512 <= nFFT <= 2048
+        - nbFilters            (int)           : Length of the Mel Filter bank
+        - melFilter            (numpy array)   : Mel Filter bank
+        - modulLen             (float)         : Length (in seconds) of the modulation computation window
+    '''
+
+<<<<<<< HEAD
+    @interfacedoc    
+=======
+    implements(IAnalyzer)
+
+    @interfacedoc
+>>>>>>> 7c3ccb1c5b87c4639fee32df595cca1991265657
+    def setup(self, channels=None, samplerate=None, blocksize=None,
+              totalframes=None):
+        super(IRITSpeech4Hz, self).setup(
+            channels, samplerate, blocksize, totalframes)
+            
+        self.energy4hz = []
+        # Classification
+        self.threshold = 2.0
+        
+        self.wLen      = 1.0
+        self.wStep     = 0.1
+        self.input_blocksize = int(self.wLen * samplerate)
+        self.input_stepsize = int(self.wStep * samplerate)        
+
+        # Pass-band Filter
+        self.frequency_center = 4.0
+        self.frequency_width = 0.5
+        self.orderFilter = 100
+
+        self.normalizeEnergy = True
+        self.nFFT = 2048
+        self.nbFilters = 30
+        self.modulLen = 2.0
+        self.melFilter = melFilterBank(self.nbFilters, self.nFFT, samplerate)
+
+    @staticmethod
+    @interfacedoc
+    def id():
+        return "irit_speech_4hz"
+
+    @staticmethod
+    @interfacedoc
+    def name():
+        return "IRIT Speech 4Hz Modulation"
+
+    @staticmethod
+    @interfacedoc
+    def unit():
+        return ""
+
+    def __str__(self):
+        return "Speech confidences indexes"
+
+    @frames_adapter
+    def process(self, frames, eod=False):
+        frames = frames.T[0]
+        # windowing of the frame (could be a changeable property)
+        w = frames * hamming(len(frames))
+
+        # Mel scale spectrum extraction
+        f = abs(rfft(w, n=2 * self.nFFT)[0:self.nFFT])
+        e = dot(f ** 2, self.melFilter)
+
+        self.energy4hz.append(e)
+        
+        return frames, eod
+
+    def post_process(self):
+        '''
+
+        '''
+        # Creation of the pass-band filter
+        Wo = self.frequency_center / self.samplerate()
+        Wn = [Wo - (self.frequency_width / 2) / self.samplerate(),
+              Wo + (self.frequency_width / 2) / self.samplerate()]
+        num = firwin(self.orderFilter, Wn, pass_zero=False)
+
+        # Energy on the frequency range
+        self.energy4hz = array(self.energy4hz)
+        energy = lfilter(num, 1, self.energy4hz.T, 0)
+        energy = sum(energy)
+
+        # Normalization
+        if self.normalizeEnergy:
+            energy = energy / mean(energy)
+
+        # Energy Modulation
+        frameLenModulation = int(
+            self.modulLen * self.samplerate() / self.input_blocksize)
+        modEnergyValue = computeModulation(energy, frameLenModulation, True)
+
+        # Confidence Index
+        conf = array(modEnergyValue - self.threshold) / self.threshold
+        conf[conf > 1] = 1
+
+        modEnergy = self.new_result(data_mode='value', time_mode='framewise')
+        modEnergy.id_metadata.id += '.' + 'energy_confidence'
+        modEnergy.id_metadata.name += ' ' + 'Energy Confidence'
+
+        modEnergy.data_object.value = conf
+
+        self.process_pipe.results.add(modEnergy)
+
+        # Segment
+        convert = {False: 0, True: 1}
+        label = {0: 'nonSpeech', 1: 'Speech'}
+
+        segList = segmentFromValues(modEnergyValue > self.threshold)
+        # Hint: Median filtering could improve smoothness of the result
+        # from scipy.signal import medfilt
+        # segList = segmentFromValues(medfilt(modEnergyValue > self.threshold, 31))
+
+        segs = self.new_result(data_mode='label', time_mode='segment')
+        segs.id_metadata.id += '.' + 'segments'
+        segs.id_metadata.name += ' ' + 'Segments'
+
+        segs.label_metadata.label = label
+
+        segs.data_object.label = [convert[s[2]] for s in segList]
+        segs.data_object.time = [(float(s[0]) * self.input_blocksize /
+                                 self.samplerate())
+                                 for s in segList]
+<<<<<<< HEAD
+        segs.data_object.duration = [(float(s[1]-s[0]) * self.input_blocksize /
+=======
+        segs.data_object.duration = [(float(s[1]-s[0]+1) * self.blocksize() /
+>>>>>>> 7c3ccb1c5b87c4639fee32df595cca1991265657
+                                     self.samplerate())
+                                     for s in segList]
+
+        self.process_pipe.results.add(segs)
+
+        return
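The irit_speech_4hz.py.orig backup above (note the leftover merge markers it carries) documents the 4 Hz modulation approach: per-frame Mel-band energies are band-pass filtered around 4 Hz before the modulation measure is thresholded. For orientation only, a self-contained sketch of that filtering step; the function name and frame-rate handling are assumptions of this sketch, and it normalises the band edges by an explicit Nyquist frequency, whereas the committed code divides them by the audio sample rate:

import numpy as np
from scipy.signal import firwin, lfilter

def band_energy_4hz(mel_energy, frame_rate, center=4.0, width=0.5, order=100):
    # mel_energy: (n_frames, n_mel_bands) per-frame Mel energies
    # frame_rate: frames per second of that energy sequence
    nyq = frame_rate / 2.0
    taps = firwin(order, [(center - width / 2) / nyq,
                          (center + width / 2) / nyq], pass_zero=False)
    # Filter each Mel band along the time axis, then pool across bands
    filtered = lfilter(taps, 1, mel_energy, axis=0)
    return filtered.sum(axis=1)

# Illustrative call: 30 Mel bands (as in the file), 10 energy frames per second
energy = band_energy_4hz(np.random.rand(500, 30), frame_rate=10.0)

In the file itself the summed band energy is normalised, fed to computeModulation, and the modulation is thresholded into Speech/NonSpeech segments.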
index 6827d24612a3740cabfa16faaf6dcdf42ade921b..1d91cbe3fa983698a8f79fa462a030f0cead6706 100644 (file)
@@ -74,13 +74,14 @@ class IRITSpeechEntropy(Analyzer):
 
     def post_process(self):
         entropyValue = array(self.entropyValue)
+
+       import pylab
+       pylab.plot(entropyValue)
+       pylab.show() 
         w = self.modulLen/self.wStep
-       print w,len(entropyValue)
         modulentropy = computeModulation(entropyValue, w, False)
 
-       import pylab
-       pylab.plot(modulentropy)
-       pylab.show()
+
 
         confEntropy = array(modulentropy - self.threshold) / self.threshold
         confEntropy[confEntropy > 1] = 1
diff --git a/timeside/analyzer/irit_speech_entropy.py.orig b/timeside/analyzer/irit_speech_entropy.py.orig
new file mode 100644 (file)
index 0000000..45be006
--- /dev/null
@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2013 Maxime Le Coz <lecoz@irit.fr>
+
+# This file is part of TimeSide.
+
+# TimeSide is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+
+# TimeSide is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with TimeSide.  If not, see <http://www.gnu.org/licenses/>.
+
+# Author: Maxime Le Coz <lecoz@irit.fr>
+
+from timeside.core import implements, interfacedoc
+from timeside.analyzer.core import Analyzer
+from timeside.analyzer.utils import entropy, computeModulation
+from timeside.analyzer.utils import segmentFromValues
+from timeside.api import IAnalyzer
+from numpy import array
+from scipy.ndimage.morphology import binary_opening
+from timeside.analyzer.preprocessors import frames_adapter
+
+
+class IRITSpeechEntropy(Analyzer):
+    """Speech Segmentor based on Entropy analysis."""
+
+    implements(IAnalyzer)
+
+    @interfacedoc
+    def setup(self, channels=None, samplerate=None, blocksize=None,
+              totalframes=None):
+        super(IRITSpeechEntropy, self).setup(
+            channels, samplerate, blocksize, totalframes)
+        self.entropyValue = []
+        self.threshold = 0.4
+        self.smoothLen = 5
+        self.modulLen = 2
+        self.wLen      = 1.0
+        self.wStep     = 0.1
+        self.input_blocksize = int(self.wLen * samplerate)
+        self.input_stepsize = int(self.wStep * samplerate)        
+
+    @staticmethod
+    @interfacedoc
+    def id():
+        return "irit_speech_entropy"
+
+    @staticmethod
+    @interfacedoc
+    def name():
+        return "IRIT Speech entropy"
+
+    @staticmethod
+    @interfacedoc
+    def unit():
+        return ""
+
+    def __str__(self):
+        return "Speech confidences indexes"
+        
+    @frames_adapter
+    def process(self, frames, eod=False):
+        self.entropyValue.append(entropy(frames))
+        return frames, eod
+
+    def post_process(self):
+        entropyValue = array(self.entropyValue)
+        w = self.modulLen * self.samplerate() / self.input_blocksize
+        modulentropy = computeModulation(entropyValue, w, False)
+        confEntropy = array(modulentropy - self.threshold) / self.threshold
+        confEntropy[confEntropy > 1] = 1
+
+        conf = self.new_result(data_mode='value', time_mode='framewise')
+
+        conf.id_metadata.id += '.' + 'confidence'
+        conf.id_metadata.name += ' ' + 'Confidence'
+
+        conf.data_object.value = confEntropy
+        self.process_pipe.results.add(conf)
+
+        # Binary Entropy
+        binaryEntropy = modulentropy > self.threshold
+        binaryEntropy = binary_opening(
+            binaryEntropy, [1] * (self.smoothLen * 2))
+
+        convert = {False: 0, True: 1}
+        label = {0: 'NonSpeech', 1: 'Speech'}
+        segList = segmentFromValues(binaryEntropy)
+
+        segs = self.new_result(data_mode='label', time_mode='segment')
+        segs.id_metadata.id += '.' + 'segments'
+        segs.id_metadata.name += ' ' + 'Segments'
+
+        segs.label_metadata.label = label
+
+        segs.data_object.label = [convert[s[2]] for s in segList]
+        segs.data_object.time = [(float(s[0]) * self.input_blocksize /
+                                 self.samplerate())
+                                 for s in segList]
+<<<<<<< HEAD
+        segs.data_object.duration = [(float(s[1]-s[0]) * self.input_blocksize /
+=======
+        segs.data_object.duration = [(float(s[1]-s[0]+1) * self.blocksize() /
+>>>>>>> 7c3ccb1c5b87c4639fee32df595cca1991265657
+                                     self.samplerate())
+                                     for s in segList]
+
+        self.process_pipe.results.add(segs)
+
+        return
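The entropy backup ends by smoothing the frame-wise Speech/NonSpeech decision with a morphological opening, which drops isolated short 'Speech' runs before segments are built. A tiny sketch of that step (the boolean sequence and the shortened smoothLen are made up for the demo):

import numpy as np
from scipy.ndimage import binary_opening

# Frame-wise decisions with one spurious single-frame 'Speech' burst
binary = np.array([0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0], dtype=bool)

# Opening with a flat structuring element removes runs shorter than the element
smoothLen = 2
smoothed = binary_opening(binary, structure=[1] * (smoothLen * 2))
# The lone True at index 2 disappears; the seven-frame run survives intact

In the file smoothLen is 5, so the opening uses a ten-element structure and discards Speech runs shorter than ten frames.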
diff --git a/timeside/analyzer/startShapes.dat b/timeside/analyzer/startShapes.dat
new file mode 100644 (file)
index 0000000..f094a29
Binary files /dev/null and b/timeside/analyzer/startShapes.dat differ