From: Maxime LE COZ Date: Tue, 11 Mar 2014 15:26:56 +0000 (+0100) Subject: Optimised music detector X-Git-Url: https://git.parisson.com/?a=commitdiff_plain;h=b66174bde406242ac624d3a59c89dc8eb49bb8d6;p=timeside.git Optimised music detector --- diff --git a/timeside/analyzer/core.py b/timeside/analyzer/core.py index f0e649f..9424f67 100644 --- a/timeside/analyzer/core.py +++ b/timeside/analyzer/core.py @@ -1072,7 +1072,7 @@ class Analyzer(Processor): # Automatically write known metadata result.id_metadata.date = datetime.now().replace( microsecond=0).isoformat(' ') - result.id_metadata.version = timeside.__version__ + #result.id_metadata.version = timeside.__version__ result.id_metadata.author = 'TimeSide' result.id_metadata.id = self.id() result.id_metadata.name = self.name() diff --git a/timeside/analyzer/irit_music_SLN.py b/timeside/analyzer/irit_music_SLN.py index 814cc83..ab1e714 100644 --- a/timeside/analyzer/irit_music_SLN.py +++ b/timeside/analyzer/irit_music_SLN.py @@ -25,9 +25,7 @@ from timeside.analyzer.utils import melFilterBank, computeModulation from timeside.analyzer.utils import segmentFromValues from timeside.analyzer import IRITDiverg from timeside.api import IAnalyzer -from numpy import logical_and,array, hamming, dot, mean, float, arange, nonzero -from numpy.fft import rfft -from scipy.signal import firwin, lfilter +from numpy import mean, diff, arange from timeside.analyzer.preprocessors import frames_adapter @@ -40,10 +38,10 @@ class IRITMusicSLN(Analyzer): self.parents.append(IRITDiverg()) self.wLen = 1.0 self.wStep = 0.1 - self.threshold = 20 + self.threshold = 0.05 self.input_blocksize = 0; - self.input_stepsize = 0; - + self.input_stepsize = 0; + self.maxSegForLength = 7 @interfacedoc def setup(self, channels=None, samplerate=None, blocksize=None, totalframes=None): @@ -78,21 +76,19 @@ class IRITMusicSLN(Analyzer): ''' ''' + + segList = self.process_pipe.results['irit_diverg.segments'].time - segList = self.process_pipe.results['irit_diverg.segments'].time - w = self.wLen/ 2; + w = self.wLen/ 2 end = segList[-1] - tLine = arange(0,end,self.wStep) - segLen = array([0]*len(tLine)) - - for i,t in enumerate(tLine): - idx = nonzero(logical_and(segList>(t-w) ,segList<(t+w)))[0] - segLen[i]= len(idx) + tLine = arange(w,end-w,self.wStep) + # Les plus petits ! <> article + segLen = [mean(diff(getBoundariesInInterval(t-w, t+w, segList))) for t in tLine] + # Confidence Index - conf = array(segLen - self.threshold) / self.threshold - conf[conf > 1] = 1 + conf = [(s - self.threshold) / self.threshold if s < 2*self.threshold else 1 for s in segLen] segLenRes = self.new_result(data_mode='value', time_mode='framewise') segLenRes.id_metadata.id += '.' + 'energy_confidence' @@ -106,7 +102,7 @@ class IRITMusicSLN(Analyzer): convert = {False: 0, True: 1} label = {0: 'nonMusic', 1: 'Music'} - segList = segmentFromValues(segLen > self.threshold) + segList = segmentFromValues([s > self.threshold for s in segLen]) # Hint : Median filtering could imrove smoothness of the result # from scipy.signal import medfilt # segList = segmentFromValues(medfilt(modEnergyValue > self.threshold, 31)) @@ -123,3 +119,9 @@ class IRITMusicSLN(Analyzer): self.process_pipe.results.add(segs) return + + +def getBoundariesInInterval(start,stop,boundaries) : + return [t for t in boundaries if t >= start and t<= stop] + + diff --git a/timeside/analyzer/irit_music_SNB.py b/timeside/analyzer/irit_music_SNB.py index 900b72e..98caee3 100644 --- a/timeside/analyzer/irit_music_SNB.py +++ b/timeside/analyzer/irit_music_SNB.py @@ -25,9 +25,7 @@ from timeside.analyzer.utils import melFilterBank, computeModulation from timeside.analyzer.utils import segmentFromValues from timeside.analyzer import IRITDiverg from timeside.api import IAnalyzer -from numpy import logical_and,array, hamming, dot, mean, float, arange, nonzero -from numpy.fft import rfft -from scipy.signal import firwin, lfilter +from numpy import array, mean, arange, nonzero from timeside.analyzer.preprocessors import frames_adapter @@ -80,26 +78,19 @@ class IRITMusicSNB(Analyzer): ''' segList = self.process_pipe.results['irit_diverg.segments'].time - w = self.wLen/ 2; + w = self.wLen/ 2 end = segList[-1] - tLine = arange(0,end,self.wStep) + tLine = arange(0, end, self.wStep) - segLen = array([0]*len(tLine)) - - for i,t in enumerate(tLine): - idx = nonzero(logical_and(segList>(t-w) ,segList<(t+w)))[0] - l = [tLine[t1]-tLine[t2] for t1,t2 in zip()] - segLen[i]= len(idx) + segNB = [ len(getBoundariesInInterval(t-w,t+w,segList)) for t in tLine ] # Confidence Index - conf = array(segLen - self.threshold) / self.threshold - conf[conf > 1] = 1 - + conf = [float(v - self.threshold) / float(self.threshold) if v < 2*self.threshold else 1.0 for v in segNB] segLenRes = self.new_result(data_mode='value', time_mode='framewise') segLenRes.id_metadata.id += '.' + 'energy_confidence' segLenRes.id_metadata.name += ' ' + 'Energy Confidence' - segLenRes.data_object.value = segLen + segLenRes.data_object.value = conf self.process_pipe.results.add(segLenRes) @@ -107,7 +98,7 @@ class IRITMusicSNB(Analyzer): convert = {False: 0, True: 1} label = {0: 'nonMusic', 1: 'Music'} - segList = segmentFromValues(segLen > self.threshold) + segList = segmentFromValues([c > 0 for c in conf]) # Hint : Median filtering could imrove smoothness of the result # from scipy.signal import medfilt # segList = segmentFromValues(medfilt(modEnergyValue > self.threshold, 31)) @@ -124,3 +115,7 @@ class IRITMusicSNB(Analyzer): self.process_pipe.results.add(segs) return + +def getBoundariesInInterval(start, stop, boundaries) : + return [t for t in boundaries if t >= start and t<= stop] + diff --git a/timeside/analyzer/irit_speech_4hz.py.orig b/timeside/analyzer/irit_speech_4hz.py.orig new file mode 100644 index 0000000..6acac1c --- /dev/null +++ b/timeside/analyzer/irit_speech_4hz.py.orig @@ -0,0 +1,178 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 Maxime Le Coz + +# This file is part of TimeSide. + +# TimeSide is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. + +# TimeSide is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with TimeSide. If not, see . + +# Author: Maxime Le Coz + +from timeside.core import implements, interfacedoc +from timeside.analyzer.core import Analyzer +from timeside.analyzer.utils import melFilterBank, computeModulation +from timeside.analyzer.utils import segmentFromValues +from timeside.api import IAnalyzer +from numpy import array, hamming, dot, mean, float +from numpy.fft import rfft +from scipy.signal import firwin, lfilter +from timeside.analyzer.preprocessors import frames_adapter + + +class IRITSpeech4Hz(Analyzer): + '''Speech Segmentor based on the 4Hz energy modulation analysis. + + Properties: + - energy4hz (list) : List of the 4Hz energy by frame for the modulation computation + - threshold (float) : Threshold for the classification Speech/NonSpeech + - frequency_center (float) : Center of the frequency range where the energy is extracted + - frequency_width (float) : Width of the frequency range where the energy is extracted + - orderFilter (int) : Order of the pass-band filter extracting the frequency range + - normalizeEnergy (boolean) : Whether the energy must be normalized or not + - nFFT (int) : Number of points for the FFT. Better if 512 <= nFFT <= 2048 + - nbFilters (int) : Length of the Mel Filter bank + - melFilter (numpy array) : Mel Filter bank + - modulLen (float) : Length (in second) of the modulation computation window + ''' + +<<<<<<< HEAD + @interfacedoc +======= + implements(IAnalyzer) + + @interfacedoc +>>>>>>> 7c3ccb1c5b87c4639fee32df595cca1991265657 + def setup(self, channels=None, samplerate=None, blocksize=None, + totalframes=None): + super(IRITSpeech4Hz, self).setup( + channels, samplerate, blocksize, totalframes) + + self.energy4hz = [] + # Classification + self.threshold = 2.0 + + self.wLen = 1.0 + self.wStep = 0.1 + self.input_blocksize = int(self.wLen * samplerate) + self.input_stepsize = int(self.wStep * samplerate) + + # Pass-band Filter + self.frequency_center = 4.0 + self.frequency_width = 0.5 + self.orderFilter = 100 + + self.normalizeEnergy = True + self.nFFT = 2048 + self.nbFilters = 30 + self.modulLen = 2.0 + self.melFilter = melFilterBank(self.nbFilters, self.nFFT, samplerate) + + @staticmethod + @interfacedoc + def id(): + return "irit_speech_4hz" + + @staticmethod + @interfacedoc + def name(): + return "IRIT Speech 4Hz Modulation" + + @staticmethod + @interfacedoc + def unit(): + return "" + + def __str__(self): + return "Speech confidences indexes" + + @frames_adapter + def process(self, frames, eod=False): + frames = frames.T[0] + # windowing of the frame (could be a changeable property) + w = frames * hamming(len(frames)) + + # Mel scale spectrum extraction + f = abs(rfft(w, n=2 * self.nFFT)[0:self.nFFT]) + e = dot(f ** 2, self.melFilter) + + self.energy4hz.append(e) + + return frames, eod + + def post_process(self): + ''' + + ''' + # Creation of the pass-band filter + Wo = self.frequency_center / self.samplerate() + Wn = [Wo - (self.frequency_width / 2) / self.samplerate(), + Wo + (self.frequency_width / 2) / self.samplerate()] + num = firwin(self.orderFilter, Wn, pass_zero=False) + + # Energy on the frequency range + self.energy4hz = array(self.energy4hz) + energy = lfilter(num, 1, self.energy4hz.T, 0) + energy = sum(energy) + + # Normalization + if self.normalizeEnergy: + energy = energy / mean(energy) + + # Energy Modulation + frameLenModulation = int( + self.modulLen * self.samplerate() / self.input_blocksize) + modEnergyValue = computeModulation(energy, frameLenModulation, True) + + # Confidence Index + conf = array(modEnergyValue - self.threshold) / self.threshold + conf[conf > 1] = 1 + + modEnergy = self.new_result(data_mode='value', time_mode='framewise') + modEnergy.id_metadata.id += '.' + 'energy_confidence' + modEnergy.id_metadata.name += ' ' + 'Energy Confidence' + + modEnergy.data_object.value = conf + + self.process_pipe.results.add(modEnergy) + + # Segment + convert = {False: 0, True: 1} + label = {0: 'nonSpeech', 1: 'Speech'} + + segList = segmentFromValues(modEnergyValue > self.threshold) + # Hint : Median filtering could imrove smoothness of the result + # from scipy.signal import medfilt + # segList = segmentFromValues(medfilt(modEnergyValue > self.threshold, 31)) + + segs = self.new_result(data_mode='label', time_mode='segment') + segs.id_metadata.id += '.' + 'segments' + segs.id_metadata.name += ' ' + 'Segments' + + segs.label_metadata.label = label + + segs.data_object.label = [convert[s[2]] for s in segList] + segs.data_object.time = [(float(s[0]) * self.input_blocksize / + self.samplerate()) + for s in segList] +<<<<<<< HEAD + segs.data_object.duration = [(float(s[1]-s[0]) * self.input_blocksize / +======= + segs.data_object.duration = [(float(s[1]-s[0]+1) * self.blocksize() / +>>>>>>> 7c3ccb1c5b87c4639fee32df595cca1991265657 + self.samplerate()) + for s in segList] + + self.process_pipe.results.add(segs) + + return diff --git a/timeside/analyzer/irit_speech_entropy.py b/timeside/analyzer/irit_speech_entropy.py index 6827d24..1d91cbe 100644 --- a/timeside/analyzer/irit_speech_entropy.py +++ b/timeside/analyzer/irit_speech_entropy.py @@ -74,13 +74,14 @@ class IRITSpeechEntropy(Analyzer): def post_process(self): entropyValue = array(self.entropyValue) + + import pylab + pylab.plot(entropyValue) + pylab.show() w = self.modulLen/self.wStep - print w,len(entropyValue) modulentropy = computeModulation(entropyValue, w, False) - import pylab - pylab.plot(modulentropy) - pylab.show() + confEntropy = array(modulentropy - self.threshold) / self.threshold confEntropy[confEntropy > 1] = 1 diff --git a/timeside/analyzer/irit_speech_entropy.py.orig b/timeside/analyzer/irit_speech_entropy.py.orig new file mode 100644 index 0000000..45be006 --- /dev/null +++ b/timeside/analyzer/irit_speech_entropy.py.orig @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 Maxime Le Coz + +# This file is part of TimeSide. + +# TimeSide is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. + +# TimeSide is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with TimeSide. If not, see . + +# Author: Maxime Le Coz + +from timeside.core import implements, interfacedoc +from timeside.analyzer.core import Analyzer +from timeside.analyzer.utils import entropy, computeModulation +from timeside.analyzer.utils import segmentFromValues +from timeside.api import IAnalyzer +from numpy import array +from scipy.ndimage.morphology import binary_opening +from timeside.analyzer.preprocessors import frames_adapter + + +class IRITSpeechEntropy(Analyzer): + """Speech Segmentor based on Entropy analysis.""" + + implements(IAnalyzer) + + @interfacedoc + def setup(self, channels=None, samplerate=None, blocksize=None, + totalframes=None): + super(IRITSpeechEntropy, self).setup( + channels, samplerate, blocksize, totalframes) + self.entropyValue = [] + self.threshold = 0.4 + self.smoothLen = 5 + self.modulLen = 2 + self.wLen = 1.0 + self.wStep = 0.1 + self.input_blocksize = int(self.wLen * samplerate) + self.input_stepsize = int(self.wStep * samplerate) + + @staticmethod + @interfacedoc + def id(): + return "irit_speech_entropy" + + @staticmethod + @interfacedoc + def name(): + return "IRIT Speech entropy" + + @staticmethod + @interfacedoc + def unit(): + return "" + + def __str__(self): + return "Speech confidences indexes" + + @frames_adapter + def process(self, frames, eod=False): + self.entropyValue.append(entropy(frames)) + return frames, eod + + def post_process(self): + entropyValue = array(self.entropyValue) + w = self.modulLen * self.samplerate() / self.input_blocksize + modulentropy = computeModulation(entropyValue, w, False) + confEntropy = array(modulentropy - self.threshold) / self.threshold + confEntropy[confEntropy > 1] = 1 + + conf = self.new_result(data_mode='value', time_mode='framewise') + + conf.id_metadata.id += '.' + 'confidence' + conf.id_metadata.name += ' ' + 'Confidence' + + conf.data_object.value = confEntropy + self.process_pipe.results.add(conf) + + # Binary Entropy + binaryEntropy = modulentropy > self.threshold + binaryEntropy = binary_opening( + binaryEntropy, [1] * (self.smoothLen * 2)) + + convert = {False: 0, True: 1} + label = {0: 'NonSpeech', 1: 'Speech'} + segList = segmentFromValues(binaryEntropy) + + segs = self.new_result(data_mode='label', time_mode='segment') + segs.id_metadata.id += '.' + 'segments' + segs.id_metadata.name += ' ' + 'Segments' + + segs.label_metadata.label = label + + segs.data_object.label = [convert[s[2]] for s in segList] + segs.data_object.time = [(float(s[0]) * self.input_blocksize / + self.samplerate()) + for s in segList] +<<<<<<< HEAD + segs.data_object.duration = [(float(s[1]-s[0]) * self.input_blocksize / +======= + segs.data_object.duration = [(float(s[1]-s[0]+1) * self.blocksize() / +>>>>>>> 7c3ccb1c5b87c4639fee32df595cca1991265657 + self.samplerate()) + for s in segList] + + self.process_pipe.results.add(segs) + + return diff --git a/timeside/analyzer/startShapes.dat b/timeside/analyzer/startShapes.dat new file mode 100644 index 0000000..f094a29 Binary files /dev/null and b/timeside/analyzer/startShapes.dat differ