From dabdf3ae23135ba82a5e8da3a7bbb8ec2aa8ac9e Mon Sep 17 00:00:00 2001 From: Thomas Fillon Date: Thu, 20 Mar 2014 13:50:11 +0100 Subject: [PATCH] Add a sha1sum computation for decoders sha1sum will be computed for files, URLs and numpy arrays --- timeside/analyzer/core.py | 6 +++ timeside/decoder/array.py | 3 +- timeside/decoder/core.py | 9 +++- timeside/decoder/file.py | 2 + timeside/decoder/live.py | 1 + timeside/decoder/utils.py | 109 ++++++++++++++++++++++++++++++++------ 6 files changed, 111 insertions(+), 19 deletions(-) diff --git a/timeside/analyzer/core.py b/timeside/analyzer/core.py index b3eafc9..486eb0c 100644 --- a/timeside/analyzer/core.py +++ b/timeside/analyzer/core.py @@ -241,6 +241,10 @@ class AudioMetadata(MetadataObject): Start time of the segment in seconds duration : float Duration of the segment in seconds + is_segment : boolean + Is the media a segment of an audio source + sha1 : str + Sha1 hexadecimal digest of the audio source channels : int Number of channels channelsManagement : str @@ -257,6 +261,7 @@ class AudioMetadata(MetadataObject): ('start', 0), ('duration', None), ('is_segment', None), + ('sha1', ''), ('channels', None), ('channelsManagement', '')]) @@ -1100,6 +1105,7 @@ class Analyzer(Processor): result.id_metadata.uuid = self.uuid() result.audio_metadata.uri = self.mediainfo()['uri'] + result.audio_metadata.sha1 = self.mediainfo()['sha1'] result.audio_metadata.start = self.mediainfo()['start'] result.audio_metadata.duration = self.mediainfo()['duration'] result.audio_metadata.is_segment = self.mediainfo()['is_segment'] diff --git a/timeside/decoder/array.py b/timeside/decoder/array.py index 581603a..bb66dbf 100644 --- a/timeside/decoder/array.py +++ b/timeside/decoder/array.py @@ -72,7 +72,8 @@ class ArrayDecoder(Decoder): self.uri = '_'.join(['raw_audio_array', 'x'.join([str(dim) for dim in samples.shape]), samples.dtype.type.__name__]) - + from .utils import sha1sum_numpy + self._sha1 = sha1sum_numpy(self.samples) self.frames = 
self.get_frames() def setup(self, channels=None, samplerate=None, blocksize=None): diff --git a/timeside/decoder/core.py b/timeside/decoder/core.py index 1eba577..60355e2 100644 --- a/timeside/decoder/core.py +++ b/timeside/decoder/core.py @@ -32,7 +32,7 @@ from timeside.core import Processor, implements, interfacedoc, abstract from timeside.api import IDecoder from timeside.tools import * -from utils import get_uri, get_media_uri_info, stack +from utils import get_uri, get_media_uri_info, stack, get_sha1 import Queue from gst import _gst as gst @@ -93,7 +93,12 @@ class Decoder(Processor): duration=self.uri_duration, start=self.uri_start, is_segment=self.is_segment, - samplerate=self.input_samplerate) + samplerate=self.input_samplerate, + sha1=self.sha1) + + @property + def sha1(self): + return self._sha1 def __del__(self): self.release() diff --git a/timeside/decoder/file.py b/timeside/decoder/file.py index 9f29b4e..3397227 100644 --- a/timeside/decoder/file.py +++ b/timeside/decoder/file.py @@ -70,6 +70,8 @@ class FileDecoder(Decoder): self.stack = stack self.uri = get_uri(uri) + self._sha1 = get_sha1(uri) + self.uri_total_duration = get_media_uri_info(self.uri)['duration'] self.mimetype = None diff --git a/timeside/decoder/live.py b/timeside/decoder/live.py index fdb596b..9252330 100644 --- a/timeside/decoder/live.py +++ b/timeside/decoder/live.py @@ -85,6 +85,7 @@ class LiveDecoder(Decoder): self.uri_duration = None self.is_segment = False self.input_src = input_src + self._sha1 = '' def setup(self, channels=None, samplerate=None, blocksize=None): diff --git a/timeside/decoder/utils.py b/timeside/decoder/utils.py index 0b21f3e..50e9a71 100644 --- a/timeside/decoder/utils.py +++ b/timeside/decoder/utils.py @@ -26,7 +26,7 @@ from __future__ import division -import numpy +import numpy as np class Noise(object): """A class that mimics audiolab.sndfile but generates noise instead of reading @@ -60,7 +60,7 @@ class Noise(object): else: will_read = frames_to_read 
self.seekpoint += will_read - return numpy.random.random(will_read)*2 - 1 + return np.random.random(will_read)*2 - 1 def path2uri(path): @@ -78,33 +78,43 @@ def path2uri(path): return urlparse.urljoin('file:', urllib.pathname2url(path)) +def source_info(source): + import os.path + + src_info = {'is_file': False, + 'uri': '', + 'pathname': ''} + + if os.path.exists(source): + src_info['is_file'] = True + # get the absolute path + src_info['pathname'] = os.path.abspath(source) + # and make a uri of it + src_info['uri'] = path2uri(src_info['pathname']) + return src_info + + def get_uri(source): """ Check a media source as a valid file or uri and return the proper uri """ import gst - # Is this an valid URI source - if gst.uri_is_valid(source): + + src_info = source_info(source) + + if src_info['is_file']: # Is this a file? + return get_uri(src_info['uri']) + + elif gst.uri_is_valid(source): # Is this a valid URI source for Gstreamer uri_protocol = gst.uri_get_protocol(source) if gst.uri_protocol_is_supported(gst.URI_SRC, uri_protocol): return source else: raise IOError('Invalid URI source for Gstreamer') - - # is this a file? - import os.path - if os.path.exists(source): - # get the absolute path - pathname = os.path.abspath(source) - # and make a uri of it - uri = path2uri(pathname) - - return get_uri(uri) else: - raise IOError('Failed getting uri for path %s: not such file or directoy' % source) + raise IOError('Failed getting uri for path %s: no such file' % source) - return uri def get_media_uri_info(uri): @@ -151,6 +161,73 @@ def stack(process_func): return wrapper +def get_sha1(source): + src_info = source_info(source) + + if src_info['is_file']: # Is this a file? 
+ return sha1sum_file(src_info['pathname']) + else: # Then it should be an url + return sha1sum_url(source) + + +def sha1sum_file(filename): + ''' + Return the secure hash digest with sha1 algorithm for a given file + + >>> print sha1sum_file('../../tests/samples/guitar.wav') + 08301c3f9a8d60926f31e253825cc74263e52ad1 + ''' + import hashlib + import io + + sha1 = hashlib.sha1() + chunk_size = sha1.block_size * io.DEFAULT_BUFFER_SIZE + + with open(filename, 'rb') as f: + for chunk in iter(lambda: f.read(chunk_size), b''): + sha1.update(chunk) + return sha1.hexdigest() + + +def sha1sum_url(url): + '''Return the secure hash digest with sha1 algorithm for a given url + + >>> url = 'https://github.com/yomguy/timeside-samples/raw/master/samples/guitar.wav' + >>> print sha1sum_url(url) + 08301c3f9a8d60926f31e253825cc74263e52ad1 + >>> uri = get_uri('../../tests/samples/guitar.wav') + >>> print sha1sum_url(uri) + 08301c3f9a8d60926f31e253825cc74263e52ad1 + + ''' + import hashlib + import urllib + from contextlib import closing + + sha1 = hashlib.sha1() + chunk_size = sha1.block_size * 8192 + + max_file_size = 10*1024*1024 # 10Mo limit in case of very large file + + total_read = 0 + with closing(urllib.urlopen(url)) as url_obj: + for chunk in iter(lambda: url_obj.read(chunk_size), b''): + sha1.update(chunk) + total_read += chunk_size + if total_read > max_file_size: + break + + return sha1.hexdigest() + + +def sha1sum_numpy(np_array): + ''' + Return the secure hash digest with sha1 algorithm for a numpy array + ''' + import hashlib + return hashlib.sha1(np_array.view(np.uint8)).hexdigest() + + if __name__ == "__main__": import doctest doctest.testmod() -- 2.39.5