From a20e910b9830a6453e8181b7a9e312ffbf90cb59 Mon Sep 17 00:00:00 2001 From: Guillaume Pellerin Date: Tue, 19 Aug 2025 11:05:00 +0200 Subject: [PATCH] add whisper transcription --- bin/mastering/mastering.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/bin/mastering/mastering.py b/bin/mastering/mastering.py index 63862c2..ae3b3b1 100755 --- a/bin/mastering/mastering.py +++ b/bin/mastering/mastering.py @@ -6,14 +6,10 @@ import psutil import logging import datetime import argparse -import soundfile -import librosa -import numpy as np -from scipy import signal class Logger: - """A logging object""" + """A file logger""" def __init__(self, file): self.logger = logging.getLogger('myapp') @@ -30,13 +26,14 @@ class TeleCasterMastering(object): raw_source_formats = ['webm',] source_formats = ['webm', 'mp4'] dest_formats = { - 'mp3' : {'in': '', 'out': '-vn -acodec libmp3lame -aq 6'}, + # 'mp3' : {'in': '', 'out': '-vn -acodec libmp3lame -aq 6'}, 'mp4' : {'in': '', 'out': '-c:v libx264-c:a aac -b:a 96k'}, 'jpg' : {'in': '-ss 0:0:10', 'out': '-frames:v 1 -y'} } nvidia_formats = {'mp4': {'in': '', 'out': '-c:v h264_nvenc -maxrate 1100k -c:a aac -b:a 128k'}} vaapi_formats = {'mp4': {'in': '-hwaccel vaapi -hwaccel_device /dev/dri/renderD128 -hwaccel_output_format vaapi', 'out': '-c:v h264_vaapi -c:a aac -b:a 96k'}} + transcription_format = "vtt" date_limit = datetime.datetime(year=2024, month=4, day=19) tmp_dir = "/tmp/" @@ -67,6 +64,7 @@ class TeleCasterMastering(object): self.verbose_mode = args.verbose self.remux_only_mode = args.remux_only self.date_filter = args.date_filter + self.transcribe_mode = args.transcribe if args.input_formats: self.input_formats = args.input_formats @@ -89,6 +87,9 @@ class TeleCasterMastering(object): return extension in exts def get_offset(self, within_file, find_file, window=10): + import librosa + import numpy as np + from scipy import signal y_within, sr_within = librosa.load(within_file, sr=None, duration=60.0) y_find, _ = librosa.load(find_file, sr=sr_within, duration=60.0) c = signal.correlate(y_within, y_find[:sr_within*window], mode='valid', method='fft') @@ -165,6 +166,17 @@ class TeleCasterMastering(object): self.touch(log) + def transcribe(self, file): + import whisper + filename, ext = os.path.splitext(file) + output_dir = os.path.dirname(file) + model = whisper.load_model("turbo") + self.logger.logger.info("Transcription started...") + result = model.transcribe(file, language="fr", verbose=self.verbose_mode) + output_writer = whisper.utils.get_writer(self.transcription_format, output_dir) + output_writer(result, filename) + self.logger.logger.info("Transcription written...") + def is_processed(self, source_files): processed = False for file in source_files: @@ -221,6 +233,8 @@ class TeleCasterMastering(object): if not file in offsets: offset = offsets[1]['offsets'][file] self.transcode(file, offset=offset) + if ext == "webm" and self.transcribe_mode: + self.transcribe(file) def main(): @@ -237,6 +251,7 @@ def main(): parser.add_argument('-r', '--remux_only', help='remux only mode', action="store_true") parser.add_argument('-i','--input_formats', nargs='+', help='Required input formats') parser.add_argument('-o','--output_formats', nargs='+', help='Required output formats') + parser.add_argument('-tr', '--transcribe', help='transcribe audio to vtt', action="store_true") args = parser.parse_args() t = TeleCasterMastering(args) -- 2.39.5