From: Thomas Fillon Date: Tue, 21 Oct 2014 18:42:09 +0000 (+0200) Subject: merge dev into diadems X-Git-Url: https://git.parisson.com/?a=commitdiff_plain;h=c6134002542583c15bb55ff1815a9dbb8b3f9637;p=timeside.git merge dev into diadems --- c6134002542583c15bb55ff1815a9dbb8b3f9637 diff --cc setup.py index 052b149,ea28ff2..f925882 --- a/setup.py +++ b/setup.py @@@ -9,10 -9,9 +9,10 @@@ from setuptools.command.test import tes # Pytest class PyTest(TestCommand): + def finalize_options(self): TestCommand.finalize_options(self) - self.test_args = ['tests', '--ignore', 'tests/sandbox'] + self.test_args = ['tests', '--ignore', 'tests/sandbox', '--verbose'] self.test_suite = True def run_tests(self): diff --cc timeside/analyzer/externals/aubio_melenergy.py index 0000000,a22055f..1fcfe48 mode 000000,100644..100644 --- a/timeside/analyzer/externals/aubio_melenergy.py +++ b/timeside/analyzer/externals/aubio_melenergy.py @@@ -1,0 -1,82 +1,81 @@@ + # -*- coding: utf-8 -*- + # + # Copyright (c) 2013 Paul Brossier + + # This file is part of TimeSide. + + # TimeSide is free software: you can redistribute it and/or modify + # it under the terms of the GNU General Public License as published by + # the Free Software Foundation, either version 2 of the License, or + # (at your option) any later version. + + # TimeSide is distributed in the hope that it will be useful, + # but WITHOUT ANY WARRANTY; without even the implied warranty of + # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + # GNU General Public License for more details. + + # You should have received a copy of the GNU General Public License + # along with TimeSide. If not, see . + + # Author: Paul Brossier + from __future__ import absolute_import + + from ...core import implements, interfacedoc + from ..core import Analyzer + from ...api import IAnalyzer + from ..preprocessors import downmix_to_mono, frames_adapter + from aubio import filterbank, pvoc + - + class AubioMelEnergy(Analyzer): + + """Aubio Mel Energy analyzer""" + implements(IAnalyzer) + + def __init__(self): + super(AubioMelEnergy, self).__init__() + self.input_blocksize = 1024 + self.input_stepsize = self.input_blocksize / 4 + + @interfacedoc + def setup(self, channels=None, samplerate=None, + blocksize=None, totalframes=None): + super(AubioMelEnergy, self).setup( + channels, samplerate, blocksize, totalframes) + self.n_filters = 40 + self.n_coeffs = 13 + self.pvoc = pvoc(self.input_blocksize, self.input_stepsize) + self.melenergy = filterbank(self.n_filters, self.input_blocksize) + self.melenergy.set_mel_coeffs_slaney(samplerate) + self.block_read = 0 + self.melenergy_results = [] + + @staticmethod + @interfacedoc + def id(): + return "aubio_melenergy" + + @staticmethod + @interfacedoc + def name(): + return "Mel Energy (aubio)" + + @staticmethod + @interfacedoc + def unit(): + return "" + + @downmix_to_mono + @frames_adapter + def process(self, frames, eod=False): + + fftgrain = self.pvoc(frames) + self.melenergy_results.append(self.melenergy(fftgrain)) + self.block_read += 1 + return frames, eod + + def post_process(self): + melenergy = self.new_result(data_mode='value', time_mode='framewise') + melenergy.parameters = dict(n_filters=self.n_filters, + n_coeffs=self.n_coeffs) + melenergy.data_object.value = self.melenergy_results + self.add_result(melenergy) diff --cc timeside/analyzer/limsi_sad.py index 7396965,7762f96..ce492d1 --- a/timeside/analyzer/limsi_sad.py +++ b/timeside/analyzer/limsi_sad.py @@@ -30,11 -30,15 +30,17 @@@ import numpy as n import pickle import os.path + # Require Yaafe + if not timeside._WITH_YAAFE: + raise ImportError('yaafelib must be missing') + class GMM: ++ """ Gaussian Mixture Model """ ++ def __init__(self, weights, means, vars): self.weights = weights self.means = means @@@ -47,9 -51,9 +53,9 @@@ - 2 * np.dot(x, (self.means / self.vars).T) + np.dot(x ** 2, (1.0 / self.vars).T)) + np.log(self.weights) -- m = np.amax(llh,1) ++ m = np.amax(llh, 1) dif = llh - np.atleast_2d(m).T -- return m + np.log(np.sum(np.exp(dif),1)) ++ return m + np.log(np.sum(np.exp(dif), 1)) def slidewinmap(lin, winsize, func): @@@ -65,19 -69,19 +71,22 @@@ winsize: size of the sliding windows in samples (int) func: function to be mapped on sliding windows """ -- tmpin = ([lin[0]] * (winsize/2)) + list(lin) + ([lin[-1]] * (winsize -1 - winsize/2)) ++ tmpin = ([lin[0]] * (winsize / 2)) + list(lin) + \ ++ ([lin[-1]] * (winsize - 1 - winsize / 2)) lout = [] for i in xrange(len(lin)): -- lout.append(func(tmpin[i:(i+winsize)])) ++ lout.append(func(tmpin[i:(i + winsize)])) assert(len(lin) == len(lout)) return lout ++ def dilatation(lin, winsize): """ morphological dilation """ return slidewinmap(lin, winsize, max) ++ def erosion(lin, winsize): """ morphological erosion @@@ -86,6 -90,6 +95,7 @@@ class LimsiSad(Analyzer): ++ """ Limsi Speech Activity Detection Systems LimsiSad performs frame level speech activity detection based on trained GMM models @@@ -145,10 -149,10 +155,11 @@@ super(LimsiSad, self).__init__() # feature extraction defition -- feature_plan = ['mfcc: MFCC CepsIgnoreFirstCoeff=0 blockSize=1024 stepSize=256', -- 'mfccd1: MFCC CepsIgnoreFirstCoeff=0 blockSize=1024 stepSize=256 > Derivate DOrder=1', -- 'mfccd2: MFCC CepsIgnoreFirstCoeff=0 blockSize=1024 stepSize=256 > Derivate DOrder=2', -- 'zcr: ZCR blockSize=1024 stepSize=256'] ++ feature_plan = [ ++ 'mfcc: MFCC CepsIgnoreFirstCoeff=0 blockSize=1024 stepSize=256', ++ 'mfccd1: MFCC CepsIgnoreFirstCoeff=0 blockSize=1024 stepSize=256 > Derivate DOrder=1', ++ 'mfccd2: MFCC CepsIgnoreFirstCoeff=0 blockSize=1024 stepSize=256 > Derivate DOrder=2', ++ 'zcr: ZCR blockSize=1024 stepSize=256'] yaafe_analyzer = get_processor('yaafe') self.parents['yaafe'] = yaafe_analyzer(feature_plan=feature_plan, input_samplerate=16000) @@@ -205,7 -209,7 +216,8 @@@ features = np.concatenate((mfcc, mfccd1, mfccd2, zcr), axis=1) # compute log likelihood difference -- res = 0.5 + 0.5 * (self.gmms[0].llh(features) - self.gmms[1].llh(features)) ++ res = 0.5 + 0.5 * \ ++ (self.gmms[0].llh(features) - self.gmms[1].llh(features)) # bounds log likelihood difference if self.dllh_bounds is not None: @@@ -213,8 -217,8 +225,10 @@@ res = np.minimum(np.maximum(res, mindiff), maxdiff) # performs dilation, erosion, erosion, dilatation -- ws = int(self.dews * float(self.input_samplerate ) / self.input_stepsize) -- deed_llh = dilatation(erosion(erosion(dilatation(res, ws), ws), ws), ws) ++ ws = int( ++ self.dews * float(self.input_samplerate) / self.input_stepsize) ++ deed_llh = dilatation( ++ erosion(erosion(dilatation(res, ws), ws), ws), ws) # infer speech and non speech segments from dilated # and erroded likelihood difference estimate @@@ -222,7 -226,7 +236,8 @@@ labels = [] times = [] durations = [] -- for i, val in enumerate([1 if e > self.speech_threshold else 0 for e in deed_llh]): ++ for i, val in enumerate([1 if e > self.speech_threshold else 0 ++ for e in deed_llh]): if val != last: labels.append(val) durations.append(1) @@@ -230,32 -234,32 +245,40 @@@ else: durations[-1] += 1 last = val -- times = [(float(e) * self.input_stepsize) / self.input_samplerate for e in times] -- durations = [(float(e) * self.input_stepsize) / self.input_samplerate for e in durations] -- ++ times = [(float(e) * self.input_stepsize) ++ / self.input_samplerate for e in times] ++ durations = [(float(e) * self.input_stepsize) ++ / self.input_samplerate for e in durations] -- # outputs the raw frame level speech/non speech log likelihood difference ++ # outputs the raw frame level speech/non speech log likelihood ++ # difference sad_result = self.new_result(data_mode='value', time_mode='framewise') sad_result.id_metadata.id += '.' + 'sad_lhh_diff' -- sad_result.id_metadata.name += ' ' + 'Speech Activity Detection Log Likelihood Difference' ++ sad_result.id_metadata.name += ' ' + \ ++ 'Speech Activity Detection Log Likelihood Difference' sad_result.data_object.value = res self.add_result(sad_result) # outputs frame level speech/non speech log likelihood difference # altered with erosion and dilatation procedures -- sad_de_result = self.new_result(data_mode='value', time_mode='framewise') ++ sad_de_result = self.new_result( ++ data_mode='value', time_mode='framewise') sad_de_result.id_metadata.id += '.' + 'sad_de_lhh_diff' -- sad_de_result.id_metadata.name += ' ' + 'Speech Activity Detection Log Likelihood Difference | dilat | erode' ++ sad_de_result.id_metadata.name += ' ' + \ ++ 'Speech Activity Detection Log Likelihood Difference | dilat | erode' sad_de_result.data_object.value = deed_llh self.add_result(sad_de_result) # outputs speech/non speech segments -- sad_seg_result = self.new_result(data_mode='label', time_mode='segment') ++ sad_seg_result = self.new_result( ++ data_mode='label', time_mode='segment') sad_seg_result.id_metadata.id += '.' + 'sad_segments' -- sad_seg_result.id_metadata.name += ' ' + 'Speech Activity Detection Segments' ++ sad_seg_result.id_metadata.name += ' ' + \ ++ 'Speech Activity Detection Segments' sad_seg_result.data_object.label = labels sad_seg_result.data_object.time = times sad_seg_result.data_object.duration = durations -- sad_seg_result.data_object.label_metadata.label = {0: 'Not Speech', 1: 'Speech'} ++ sad_seg_result.data_object.label_metadata.label = { ++ 0: 'Not Speech', 1: 'Speech'} self.add_result(sad_seg_result)