first
0 parents
Showing 12 changed files with 848 additions and 0 deletions
funclip/__init__.py
0 → 100644
File mode changed
4 more changed files (no preview for this file type)
funclip/argparse_tools.py
0 → 100644
import argparse
import sys
from pathlib import Path

import yaml


class ArgumentParser(argparse.ArgumentParser):
    """Simple implementation of ArgumentParser supporting a config file.

    This class originated from https://github.com/bw2/ConfigArgParse,
    but it lacks some of the features that package provides:

    - No support for multiple config files.
    - "--config" is added as an option automatically.
    - No support for formats other than yaml.
    - No argument type checking.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.add_argument("--config", help="Give config file in yaml format")

    def parse_known_args(self, args=None, namespace=None):
        # Parse once to pick up a "--config" setting
        _args, _ = super().parse_known_args(args, namespace)
        if _args.config is not None:
            if not Path(_args.config).exists():
                self.error(f"No such file: {_args.config}")

            with open(_args.config, "r", encoding="utf-8") as f:
                d = yaml.safe_load(f)
            if not isinstance(d, dict):
                self.error(f"Config file has non dict value: {_args.config}")

            for key in d:
                for action in self._actions:
                    if key == action.dest:
                        break
                else:
                    self.error(f"unrecognized arguments: {key} (from {_args.config})")

            # NOTE(kamo): Ignore "--config" from a config file
            # NOTE(kamo): Unlike "configargparse", this module doesn't check type,
            #   i.e. we can set any type of value regardless of the argument type.
            self.set_defaults(**d)
        return super().parse_known_args(args, namespace)


def get_commandline_args():
    extra_chars = [
        " ", ";", "&", "(", ")", "|", "^", "<", ">", "?", "*",
        "[", "]", "$", "`", '"', "\\", "!", "{", "}",
    ]

    # Escape the extra characters for shell
    argv = [
        arg.replace("'", "'\\''")
        if all(char not in arg for char in extra_chars)
        else "'" + arg.replace("'", "'\\''") + "'"
        for arg in sys.argv
    ]

    return sys.executable + " " + " ".join(argv)
\ No newline at end of file
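
Usage sketch (not part of the commit): a minimal example, assuming a hypothetical conf.yaml containing "lr: 0.001". Values loaded from the YAML become argument defaults, so explicit command-line flags still win:

    from argparse_tools import ArgumentParser, get_commandline_args

    parser = ArgumentParser(description="demo")
    parser.add_argument("--lr", type=float, default=0.01)
    args = parser.parse_args()
    # python demo.py --config conf.yaml          -> args.lr == 0.001 (from YAML)
    # python demo.py --config conf.yaml --lr 1.0 -> args.lr == 1.0   (CLI wins)
    print(args.lr, get_commandline_args())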
funclip/imagemagick_test.py
0 → 100644
from moviepy.editor import TextClip, VideoFileClip, CompositeVideoClip
from moviepy.video.tools.subtitles import SubtitlesClip

# render each subtitle line with a CJK-capable font
generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=48, color='white')
subs = [((0, 2), 'sub1中文字幕'),
        ((2, 4), 'subs2'),
        ((4, 6), 'subs3'),
        ((6, 8), 'subs4')]

subtitles = SubtitlesClip(subs, generator)

video = VideoFileClip("examples/2022云栖大会_片段.mp4")
video = video.subclip(0, 8)
video = CompositeVideoClip([video, subtitles.set_pos(('center', 'bottom'))])

video.write_videofile("test_output.mp4")
\ No newline at end of file
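
Note: moviepy's TextClip renders text through ImageMagick, so this smoke test fails if the binary cannot be found. A sketch of pointing moviepy at a specific install; the path below is an assumption, adjust it for your system:

    from moviepy.config import change_settings

    # hypothetical path; on many Linux distributions the binary is /usr/bin/convert
    change_settings({"IMAGEMAGICK_BINARY": "/usr/bin/convert"})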
funclip/launch.py
0 → 100644
import gradio as gr
from funasr import AutoModel
from videoclipper import VideoClipper

if __name__ == "__main__":
    funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                             model_revision="v2.0.4",
                             vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                             vad_model_revision="v2.0.4",
                             punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                             punc_model_revision="v2.0.4",
                             spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
                             spk_model_revision="v2.0.2",
                             )
    audio_clipper = VideoClipper(funasr_model)

    def audio_recog(audio_input, sd_switch, hotwords):
        return audio_clipper.recog(audio_input, sd_switch, hotwords=hotwords)

    def audio_clip(dest_text, audio_spk_input, start_ost, end_ost, state):
        return audio_clipper.clip(dest_text, start_ost, end_ost, state, dest_spk=audio_spk_input)

    def video_recog(video_input, sd_switch, hotwords):
        return audio_clipper.video_recog(video_input, sd_switch, hotwords)

    def video_clip(dest_text, video_spk_input, start_ost, end_ost, state):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, dest_spk=video_spk_input)

    def video_clip_addsub(dest_text, video_spk_input, start_ost, end_ost, state, font_size, font_color):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, font_size, font_color, add_sub=True,
                                        dest_spk=video_spk_input)

    top_md_1 = ("""
**<font color="#1785c4"></font>**
<div align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
</div>
</div>
""")

    top_md_2 = ("""
<div align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
</div>
</div>
""")

    top_md_3 = ("""
* Step1: 上传视频或音频文件(或使用下方的用例体验),点击 **<font color="#f7802b">识别</font>** 按钮
* Step2: 复制识别结果中所需的文字至右上方,或者在右侧设置说话人标识,设置偏移与字幕配置(可选)
* Step3: 点击 **<font color="#f7802b">裁剪</font>** 按钮或 **<font color="#f7802b">裁剪并添加字幕</font>** 按钮获得结果
""")

    # gradio interface
    with gr.Blocks() as demo:
        # gr.Image("./examples/guide.png", show_label=False)
        gr.Markdown(top_md_1)
        gr.Markdown(top_md_2)
        gr.Markdown(top_md_3)
        video_state = gr.State()
        audio_state = gr.State()
        with gr.Tab("🎥✂️视频裁剪 Video Clipping"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="🎥视频输入 Video Input")
                    with gr.Row():
                        video_sd_switch = gr.Radio(["no", "yes"], label="👥是否区分说话人 Recognize Speakers",
                                                   value='no')
                        hotwords_input = gr.Textbox(label="🚒热词 Hotwords")
                    recog_button2 = gr.Button("👂识别 Recognize")
                    video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    video_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    video_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
                    video_spk_input = gr.Textbox(label="✏️待裁剪说话人 Speaker to Clip (多个说话人使用'#'连接)")
                    with gr.Row():
                        video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                    label="⏪开始位置偏移 Start Offset (ms)")
                        video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50,
                                                  label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2,
                                              label="🔠字幕字体大小 Subtitle Font Size")
                        font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color",
                                              value='white')
                    # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
                    with gr.Row():
                        clip_button2 = gr.Button("✂️裁剪\nClip")
                        clip_button3 = gr.Button("✂️裁剪并添加字幕\nClip and Generate Subtitles")
                    video_output = gr.Video(label="🎥裁剪结果 Video Clipped")
                    video_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    video_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        with gr.Tab("🔊✂️音频裁剪 Audio Clipping"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(label="🔊音频输入 Audio Input")
                    with gr.Row():
                        audio_sd_switch = gr.Radio(["no", "yes"], label="👥是否区分说话人 Recognize Speakers",
                                                   value='no')
                        hotwords_input2 = gr.Textbox(label="🚒热词 Hotwords")
                    recog_button1 = gr.Button("👂识别 Recognize")
                    audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    audio_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    audio_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
                    audio_spk_input = gr.Textbox(label="✏️待裁剪说话人 Speaker to Clip (多个说话人使用'#'连接)")
                    with gr.Row():
                        audio_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                    label="⏪开始位置偏移 Start Offset (ms)")
                        audio_end_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                  label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        clip_button1 = gr.Button("✂️裁剪 Clip")
                    audio_output = gr.Audio(label="🔊裁剪结果 Audio Clipped")
                    audio_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    audio_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        recog_button1.click(audio_recog,
                            inputs=[audio_input, audio_sd_switch, hotwords_input2],
                            outputs=[audio_text_output, audio_srt_output, audio_state])
        clip_button1.click(audio_clip,
                           inputs=[audio_text_input, audio_spk_input, audio_start_ost, audio_end_ost, audio_state],
                           outputs=[audio_output, audio_mess_output, audio_srt_clip_output])

        recog_button2.click(video_recog,
                            inputs=[video_input, video_sd_switch, hotwords_input],
                            outputs=[video_text_output, video_srt_output, video_state])
        clip_button2.click(video_clip,
                           inputs=[video_text_input, video_spk_input, video_start_ost, video_end_ost, video_state],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])
        clip_button3.click(video_clip_addsub,
                           inputs=[video_text_input, video_spk_input, video_start_ost, video_end_ost, video_state,
                                   font_size, font_color],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])

    # start gradio service locally
    demo.launch()
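
Deployment sketch (illustrative, not part of the commit): demo.launch() serves on http://127.0.0.1:7860 by default. The keyword arguments below are standard gradio options, not FunClip-specific:

    # expose the UI on the local network and pin the port
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)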
funclip/subtitle_utils.py
0 → 100644
def time_convert(ms):
    """Convert milliseconds to an SRT timestamp (HH:MM:SS,mmm)."""
    ms = int(ms)
    tail = ms % 1000
    s = ms // 1000
    mi = s // 60
    s = s % 60
    h = mi // 60
    mi = mi % 60
    h = str(h).zfill(2)
    mi = str(mi).zfill(2)
    s = str(s).zfill(2)
    tail = str(tail).zfill(3)  # SRT requires a three-digit millisecond field
    return "{}:{}:{},{}".format(h, mi, s, tail)


class Text2SRT():
    def __init__(self, text, timestamp, offset=0):
        self.token_list = [i for i in text.split() if len(i)]
        self.timestamp = timestamp
        start, end = timestamp[0][0] - offset, timestamp[-1][1] - offset
        self.start_sec, self.end_sec = start, end
        self.start_time = time_convert(start)
        self.end_time = time_convert(end)

    def text(self):
        # join tokens, inserting a space only before non-CJK words
        res = ""
        for word in self.token_list:
            if '\u4e00' <= word <= '\u9fff':
                res += word
            else:
                res += " " + word
        return res

    def len(self):
        return len(self.token_list)

    def srt(self, acc_ost=0.0):
        return "{} --> {}\n{}\n".format(
            time_convert(self.start_sec + acc_ost * 1000),
            time_convert(self.end_sec + acc_ost * 1000),
            self.text())

    def time(self, acc_ost=0.0):
        return (self.start_sec / 1000 + acc_ost, self.end_sec / 1000 + acc_ost)


def generate_srt(sentence_list):
    srt_total = ''
    for i, d in enumerate(sentence_list):
        t2s = Text2SRT(d['text'], d['timestamp'])
        if 'spk' in d:
            srt_total += "{} spk{}\n{}".format(i, d['spk'], t2s.srt())
        else:
            srt_total += "{}\n{}".format(i, t2s.srt())
    return srt_total


def generate_srt_clip(sentence_list, start, end, begin_index=0, time_acc_ost=0.0):
    start, end = int(start * 1000), int(end * 1000)
    srt_total = ''
    cc = 1 + begin_index
    subs = []
    for i, d in enumerate(sentence_list):
        if d['timestamp'][-1][1] <= start:  # sentence ends before the clip
            continue
        if d['timestamp'][0][0] >= end:  # sentence starts after the clip
            break
        # sentences fully inside the clip
        if (d['timestamp'][-1][1] <= end and d['timestamp'][0][0] > start) or \
                (d['timestamp'][-1][1] == end and d['timestamp'][0][0] == start):
            t2s = Text2SRT(d['text'], d['timestamp'], offset=start)
            srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
            subs.append((t2s.time(time_acc_ost), t2s.text()))
            cc += 1
            continue
        # sentence straddles the clip start
        if d['timestamp'][0][0] <= start:
            if not d['timestamp'][-1][1] > end:
                for j, ts in enumerate(d['timestamp']):
                    if ts[1] > start:
                        break
                _text = " ".join(d['text'].split()[j:])
                _ts = d['timestamp'][j:]
            else:  # sentence covers the whole clip
                for j, ts in enumerate(d['timestamp']):
                    if ts[1] > start:
                        _start = j
                        break
                for j, ts in enumerate(d['timestamp']):
                    if ts[1] > end:
                        _end = j
                        break
                _text = " ".join(d['text'].split()[_start:_end])
                _ts = d['timestamp'][_start:_end]
            if len(_ts):
                t2s = Text2SRT(_text, _ts, offset=start)
                srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
                subs.append((t2s.time(time_acc_ost), t2s.text()))
                cc += 1
            continue
        # sentence straddles the clip end
        if d['timestamp'][-1][1] > end:
            for j, ts in enumerate(d['timestamp']):
                if ts[1] > end:
                    break
            _text = " ".join(d['text'].split()[:j])
            _ts = d['timestamp'][:j]
            if len(_ts):
                t2s = Text2SRT(_text, _ts, offset=start)
                srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
                subs.append(
                    (t2s.time(time_acc_ost), t2s.text())
                )
                cc += 1
            continue
    return srt_total, subs, cc
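
Worked example (made-up millisecond timestamps, for illustration only) of the sentence_info structure these helpers consume and the SRT text generate_srt produces; note that CJK tokens are joined without spaces while other tokens keep a leading space:

    from subtitle_utils import generate_srt

    sentences = [
        {'text': '大 家 好', 'timestamp': [[0, 400], [400, 800], [800, 1200]]},
        {'text': 'hello world', 'timestamp': [[1500, 2000], [2000, 2600]]},
    ]
    print(generate_srt(sentences))
    # 0
    # 00:00:00,000 --> 00:00:01,200
    # 大家好
    # 1
    # 00:00:01,500 --> 00:00:02,600
    #  hello world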
funclip/test.sh
0 → 100644
#!/usr/bin/env bash
# step 1: Recognize
python videoclipper.py --stage 1 \
    --file ../examples/2022云栖大会_片段.mp4 \
    --sd_switch yes \
    --output_dir ./output
# now you can find recognition results and the entire SRT file in ./output/

# step 2: Clip (pass "--dest_spk spk0" instead of --dest_text to clip by speaker)
python videoclipper.py --stage 2 \
    --file ../examples/2022云栖大会_片段.mp4 \
    --output_dir ./output \
    --dest_text '所以这个是我们办这个奖的初心啊,我们也会一届一届的办下去' \
    --start_ost 0 \
    --end_ost 100 \
    --output_file './output/res.mp4'
\ No newline at end of file
funclip/trans_utils.py
0 → 100644
import os
import ast

import numpy as np

PUNC_LIST = [',', '。', '!', '?', '、']


def pre_proc(text):
    # strip punctuation and put spaces around CJK characters so the query
    # tokenizes the same way as the recognizer's raw output
    res = ''
    for i in range(len(text)):
        if text[i] in PUNC_LIST:
            continue
        if '\u4e00' <= text[i] <= '\u9fff':
            if len(res) and res[-1] != " ":
                res += ' ' + text[i] + ' '
            else:
                res += text[i] + ' '
        else:
            res += text[i]
    if len(res) and res[-1] == ' ':
        res = res[:-1]
    return res


def proc(raw_text, timestamp, dest_text):
    # simple matching: locate every occurrence of dest_text in raw_text and
    # map the matched token span to sample offsets (ms * 16 = samples at 16 kHz)
    ld = len(dest_text.split())
    mi, ts = [], []
    offset = 0
    while True:
        fi = raw_text.find(dest_text, offset, len(raw_text))
        if fi == -1:
            break
        ti = raw_text[:fi].count(' ')  # token index of the match
        offset = fi + ld
        mi.append(fi)
        ts.append([timestamp[ti][0] * 16, timestamp[ti + ld - 1][1] * 16])
    return ts


def proc_spk(dest_spk, sd_sentences):
    # collect the periods of a given speaker (e.g. "spk0"),
    # skipping segments shorter than one second
    ts = []
    for d in sd_sentences:
        d_start = d['timestamp'][0][0]
        d_end = d['timestamp'][-1][1]
        spkid = dest_spk[3:]
        if str(d['spk']) == spkid and d_end - d_start > 999:
            ts.append([d['start'] * 16, d['end'] * 16])
    return ts


def generate_vad_data(data, sd_sentences, sr=16000):
    assert len(data.shape) == 1
    vad_data = []
    for d in sd_sentences:
        d_start = round(d['ts_list'][0][0] / 1000, 2)
        d_end = round(d['ts_list'][-1][1] / 1000, 2)
        vad_data.append([d_start, d_end, data[int(d_start * sr):int(d_end * sr)]])
    return vad_data


def write_state(output_dir, state):
    for key in ['/recog_res_raw', '/timestamp', '/sentences']:
        with open(output_dir + key, 'w') as fout:
            fout.write(str(state[key[1:]]))
    if 'sd_sentences' in state:
        with open(output_dir + '/sd_sentences', 'w') as fout:
            fout.write(str(state['sd_sentences']))


def load_state(output_dir):
    # the state files are repr() dumps of Python literals, so
    # ast.literal_eval is a safer equivalent of eval() here
    state = {}
    with open(output_dir + '/recog_res_raw') as fin:
        state['recog_res_raw'] = fin.read()
    with open(output_dir + '/timestamp') as fin:
        state['timestamp'] = ast.literal_eval(fin.read())
    with open(output_dir + '/sentences') as fin:
        state['sentences'] = ast.literal_eval(fin.read())
    if os.path.exists(output_dir + '/sd_sentences'):
        with open(output_dir + '/sd_sentences') as fin:
            state['sd_sentences'] = ast.literal_eval(fin.read())
    return state


def convert_pcm_to_float(data):
    if data.dtype == np.float64:
        return data
    elif data.dtype == np.float32:
        return data.astype(np.float64)
    elif data.dtype == np.int16:
        bit_depth = 16
    elif data.dtype == np.int32:
        bit_depth = 32
    elif data.dtype == np.int8:
        bit_depth = 8
    else:
        raise ValueError("Unsupported audio data type")

    # Now handle the integer types: scale to [-1, 1]
    max_int_value = float(2 ** (bit_depth - 1))
    if bit_depth == 8:
        data = data - 128  # 8-bit PCM is offset-binary
    return data.astype(np.float64) / max_int_value
\ No newline at end of file
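
Worked example (illustrative values): pre_proc normalizes a query into the same space-separated token form as the recognizer's raw output, and proc matches on token index to read the timestamps off, scaling milliseconds by 16 to get sample offsets at 16 kHz:

    from trans_utils import pre_proc, proc

    raw_text = '我 们 好 hello'    # 4 recognizer tokens
    timestamp = [[0, 300], [300, 600], [600, 900], [900, 1500]]  # ms per token
    dest = pre_proc('我们,')       # -> '我 们' (punctuation stripped)
    print(proc(raw_text, timestamp, dest))
    # [[0, 9600]]: token 0 starts at 0 ms, token 1 ends at 600 ms,
    # and 600 ms * 16 = 9600 samples at 16 kHz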
funclip/videoclipper.py
0 → 100644
import re
import os
import sys
import copy
import librosa
import logging
import argparse
import numpy as np
import soundfile as sf
import moviepy.editor as mpy
from moviepy.editor import TextClip, CompositeVideoClip, concatenate_videoclips
from moviepy.video.tools.subtitles import SubtitlesClip
from subtitle_utils import generate_srt, generate_srt_clip
from argparse_tools import ArgumentParser, get_commandline_args
from trans_utils import pre_proc, proc, write_state, load_state, proc_spk, convert_pcm_to_float


class VideoClipper():
    def __init__(self, funasr_model):
        logging.warning("Initializing VideoClipper.")
        self.funasr_model = funasr_model
        self.GLOBAL_COUNT = 0

    def recog(self, audio_input, sd_switch='no', state=None, hotwords=""):
        if state is None:
            state = {}
        sr, data = audio_input

        # Convert to float64 consistently (includes data type checking)
        data = convert_pcm_to_float(data)

        if sr != 16000:  # resample with librosa
            data = librosa.resample(data, orig_sr=sr, target_sr=16000)
            sr = 16000  # keep the stored rate consistent with the resampled data
        if len(data.shape) == 2:  # multi-channel wav input
            logging.warning("Input wav shape: {}, only first channel reserved.".format(data.shape))
            data = data[:, 0]
        state['audio_input'] = (sr, data)
        if sd_switch == 'yes':
            rec_result = self.funasr_model.generate(data, return_raw_text=True, is_final=True, hotword=hotwords)
            res_srt = generate_srt(rec_result[0]['sentence_info'])
            state['sd_sentences'] = rec_result[0]['sentence_info']
        else:
            rec_result = self.funasr_model.generate(data,
                                                    return_spk_res=False,
                                                    sentence_timestamp=True,
                                                    return_raw_text=True,
                                                    is_final=True,
                                                    hotword=hotwords)
            res_srt = generate_srt(rec_result[0]['sentence_info'])
        state['recog_res_raw'] = rec_result[0]['raw_text']
        state['timestamp'] = rec_result[0]['timestamp']
        state['sentences'] = rec_result[0]['sentence_info']
        res_text = rec_result[0]['text']
        return res_text, res_srt, state

    def clip(self, dest_text, start_ost, end_ost, state, dest_spk=None):
        # get from state
        audio_input = state['audio_input']
        recog_res_raw = state['recog_res_raw']
        timestamp = state['timestamp']
        sentences = state['sentences']
        sr, data = audio_input
        data = data.astype(np.float64)

        all_ts = []
        if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
            for _dest_text in dest_text.split('#'):
                # an optional "[b,e]" suffix on a sub-sentence adds extra offsets in ms
                match = None
                if '[' in _dest_text:
                    match = re.search(r'\[(\d+),\s*(\d+)\]', _dest_text)
                    if match:
                        offset_b, offset_e = map(int, match.groups())
                        log_append = ""
                    else:
                        offset_b, offset_e = 0, 0
                        log_append = "(Bracket detected in dest_text but offset time matching failed)"
                    _dest_text = _dest_text[:_dest_text.find('[')]
                else:
                    log_append = ""
                    offset_b, offset_e = 0, 0
                _dest_text = pre_proc(_dest_text)
                ts = proc(recog_res_raw, timestamp, _dest_text)
                for _ts in ts:
                    all_ts.append([_ts[0] + offset_b * 16, _ts[1] + offset_e * 16])
                if len(ts) > 1 and match:
                    log_append += ("(offsets detected but a sub-sentence matched {} periods in audio, "
                                   "offsets are applied to all periods)".format(len(ts)))
        else:
            for _dest_spk in dest_spk.split('#'):
                ts = proc_spk(_dest_spk, state['sd_sentences'])
                for _ts in ts:
                    all_ts.append(_ts)
            log_append = ""
        ts = all_ts
        srt_index = 0
        clip_srt = ""
        if len(ts):
            start, end = ts[0]
            start = min(max(0, start + start_ost * 16), len(data))
            end = min(max(0, end + end_ost * 16), len(data))
            res_audio = data[start:end]
            start_end_info = "from {} to {}".format(start / 16000, end / 16000)
            srt_clip, _, srt_index = generate_srt_clip(sentences, start / 16000.0, end / 16000.0, begin_index=srt_index)
            clip_srt += srt_clip
            for _ts in ts[1:]:  # multiple sentence input or multiple output matched
                start, end = _ts
                start = min(max(0, start + start_ost * 16), len(data))
                end = min(max(0, end + end_ost * 16), len(data))
                start_end_info += ", from {} to {}".format(start / 16000, end / 16000)
                # offsets were applied just above, so slice with the clamped bounds directly
                res_audio = np.concatenate([res_audio, data[start:end]], -1)
                srt_clip, _, srt_index = generate_srt_clip(sentences, start / 16000.0, end / 16000.0, begin_index=srt_index - 1)
                clip_srt += srt_clip
            message = "{} periods found in the speech: ".format(len(ts)) + start_end_info + log_append
        else:
            message = ("No period found in the speech, returning raw speech. "
                       "You may check the recognition result and try other destination text.")
            res_audio = data
        return (sr, res_audio), message, clip_srt

    def video_recog(self, video_filename, sd_switch='no', hotwords=""):
        video = mpy.VideoFileClip(video_filename)
        # Extract the base name, add '_clip.mp4' and '.wav'
        base_name, _ = os.path.splitext(video_filename)
        clip_video_file = base_name + '_clip.mp4'
        audio_file = base_name + '.wav'
        video.audio.write_audiofile(audio_file)
        wav = librosa.load(audio_file, sr=16000)[0]
        # delete the audio file after processing
        if os.path.exists(audio_file):
            os.remove(audio_file)
        state = {
            'video_filename': video_filename,
            'clip_video_file': clip_video_file,
            'video': video,
        }
        return self.recog((16000, wav), sd_switch, state, hotwords)

    def video_clip(self, dest_text, start_ost, end_ost, state, font_size=32, font_color='white', add_sub=False, dest_spk=None):
        # get from state
        recog_res_raw = state['recog_res_raw']
        timestamp = state['timestamp']
        sentences = state['sentences']
        video = state['video']
        clip_video_file = state['clip_video_file']
        video_filename = state['video_filename']

        all_ts = []
        srt_index = 0
        if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
            for _dest_text in dest_text.split('#'):
                match = None
                if '[' in _dest_text:
                    match = re.search(r'\[(\d+),\s*(\d+)\]', _dest_text)
                    if match:
                        offset_b, offset_e = map(int, match.groups())
                        log_append = ""
                    else:
                        offset_b, offset_e = 0, 0
                        log_append = "(Bracket detected in dest_text but offset time matching failed)"
                    _dest_text = _dest_text[:_dest_text.find('[')]
                else:
                    offset_b, offset_e = 0, 0
                    log_append = ""
                _dest_text = pre_proc(_dest_text)
                ts = proc(recog_res_raw, timestamp, _dest_text)
                for _ts in ts:
                    all_ts.append([_ts[0] + offset_b * 16, _ts[1] + offset_e * 16])
                if len(ts) > 1 and match:
                    log_append += ("(offsets detected but a sub-sentence matched {} periods in audio, "
                                   "offsets are applied to all periods)".format(len(ts)))
        else:
            for _dest_spk in dest_spk.split('#'):
                ts = proc_spk(_dest_spk, state['sd_sentences'])
                for _ts in ts:
                    all_ts.append(_ts)
        time_acc_ost = 0.0
        ts = all_ts
        clip_srt = ""
        if len(ts):
            start, end = ts[0][0] / 16000, ts[0][1] / 16000
            srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index, time_acc_ost=time_acc_ost)
            start, end = start + start_ost / 1000.0, end + end_ost / 1000.0
            video_clip = video.subclip(start, end)
            start_end_info = "from {} to {}".format(start, end)
            clip_srt += srt_clip
            if add_sub:
                generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
                subtitles = SubtitlesClip(subs, generator)
                video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center', 'bottom'))])
            concate_clip = [video_clip]
            time_acc_ost += end - start  # duration of this segment (offsets already applied)
            for _ts in ts[1:]:
                start, end = _ts[0] / 16000, _ts[1] / 16000
                srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index - 1, time_acc_ost=time_acc_ost)
                # re-base subtitle times so each segment starts at zero
                chi_subs = []
                sub_starts = subs[0][0][0]
                for sub in subs:
                    chi_subs.append(((sub[0][0] - sub_starts, sub[0][1] - sub_starts), sub[1]))
                start, end = start + start_ost / 1000.0, end + end_ost / 1000.0
                _video_clip = video.subclip(start, end)
                start_end_info += ", from {} to {}".format(start, end)
                clip_srt += srt_clip
                if add_sub:
                    generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
                    subtitles = SubtitlesClip(chi_subs, generator)
                    _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center', 'bottom'))])
                concate_clip.append(copy.copy(_video_clip))
                time_acc_ost += end - start
            message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
            logging.warning("Concatenating...")
            if len(concate_clip) > 1:
                video_clip = concatenate_videoclips(concate_clip)
            clip_video_file = clip_video_file[:-4] + '_no{}.mp4'.format(self.GLOBAL_COUNT)
            video_clip.write_videofile(clip_video_file, audio_codec="aac", temp_audiofile="video_no{}.m4a".format(self.GLOBAL_COUNT))
            self.GLOBAL_COUNT += 1
        else:
            clip_video_file = video_filename
            message = ("No period found in the audio, returning raw video. "
                       "You may check the recognition result and try other destination text.")
        return clip_video_file, message, clip_srt

def get_parser():
    parser = ArgumentParser(
        description="ClipVideo Argument",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--stage",
        type=int,
        choices=(1, 2),
        help="Stage, 1 for recognizing and 2 for clipping",
        required=True
    )
    parser.add_argument(
        "--file",
        type=str,
        default=None,
        help="Input file path",
        required=True
    )
    parser.add_argument(
        "--sd_switch",
        type=str,
        choices=("no", "yes"),
        default="no",
        help="Turn on the speaker diarization or not",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default='./output',
        help="Output files path",
    )
    parser.add_argument(
        "--dest_text",
        type=str,
        default=None,
        help="Destination text string for clipping",
    )
    parser.add_argument(
        "--dest_spk",
        type=str,
        default=None,
        help="Destination spk id for clipping",
    )
    parser.add_argument(
        "--start_ost",
        type=int,
        default=0,
        help="Offset time in ms at beginning for clipping"
    )
    parser.add_argument(
        "--end_ost",
        type=int,
        default=0,
        help="Offset time in ms at ending for clipping"
    )
    parser.add_argument(
        "--output_file",
        type=str,
        default=None,
        help="Output file path"
    )
    return parser


def runner(stage, file, sd_switch, output_dir, dest_text, dest_spk, start_ost, end_ost, output_file, config=None):
    audio_suffixes = ['.wav', '.mp3', '.aac', '.m4a', '.flac']
    video_suffixes = ['.mp4', '.avi', '.mkv', '.flv', '.mov', '.webm', '.ts', '.mpeg']
    _, ext = os.path.splitext(file)
    if ext.lower() in audio_suffixes:
        mode = 'audio'
    elif ext.lower() in video_suffixes:
        mode = 'video'
    else:
        logging.error("Unsupported file format: {}\n\nPlease choose one of the following: {}".format(
            file, audio_suffixes + video_suffixes))
        sys.exit(1)  # exit if the file is not supported
    while output_dir.endswith('/'):
        output_dir = output_dir[:-1]
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    if stage == 1:
        from funasr import AutoModel
        # initialize funasr automodel
        logging.warning("Initializing modelscope asr pipeline.")
        funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                                 model_revision="v2.0.4",
                                 vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                                 vad_model_revision="v2.0.4",
                                 punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                                 punc_model_revision="v2.0.4",
                                 spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
                                 spk_model_revision="v2.0.2",
                                 )
        audio_clipper = VideoClipper(funasr_model)
        if mode == 'audio':
            logging.warning("Recognizing audio file: {}".format(file))
            wav, sr = librosa.load(file, sr=16000)
            res_text, res_srt, state = audio_clipper.recog((sr, wav), sd_switch)
        if mode == 'video':
            logging.warning("Recognizing video file: {}".format(file))
            res_text, res_srt, state = audio_clipper.video_recog(file, sd_switch)
        total_srt_file = output_dir + '/total.srt'
        with open(total_srt_file, 'w') as fout:
            fout.write(res_srt)
        logging.warning("Write total subtitle to {}".format(total_srt_file))
        write_state(output_dir, state)
        logging.warning("Recognition succeeded. You can copy the text segment from below and use stage 2.")
        print(res_text)
    if stage == 2:
        audio_clipper = VideoClipper(None)
        if mode == 'audio':
            state = load_state(output_dir)
            wav, sr = librosa.load(file, sr=16000)
            state['audio_input'] = (sr, wav)
            (sr, audio), message, srt_clip = audio_clipper.clip(dest_text, start_ost, end_ost, state, dest_spk=dest_spk)
            if output_file is None:
                output_file = output_dir + '/result.wav'
            assert output_file.endswith('.wav'), "output_file must end with '.wav'"
            clip_srt_file = output_file[:-3] + 'srt'
            logging.warning(message)
            sf.write(output_file, audio, 16000)
            logging.warning("Save clipped wav file to {}".format(output_file))
            with open(clip_srt_file, 'w') as fout:
                fout.write(srt_clip)
            logging.warning("Write clipped subtitle to {}".format(clip_srt_file))
        if mode == 'video':
            state = load_state(output_dir)
            state['video_filename'] = file
            if output_file is None:
                state['clip_video_file'] = file[:-4] + '_clip.mp4'
            else:
                state['clip_video_file'] = output_file
            clip_srt_file = state['clip_video_file'][:-3] + 'srt'
            state['video'] = mpy.VideoFileClip(file)
            clip_video_file, message, srt_clip = audio_clipper.video_clip(dest_text, start_ost, end_ost, state, dest_spk=dest_spk)
            logging.warning("Clipping Log: {}".format(message))
            logging.warning("Save clipped mp4 file to {}".format(clip_video_file))
            with open(clip_srt_file, 'w') as fout:
                fout.write(srt_clip)
            logging.warning("Write clipped subtitle to {}".format(clip_srt_file))


def main(cmd=None):
    print(get_commandline_args(), file=sys.stderr)
    parser = get_parser()
    args = parser.parse_args(cmd)
    kwargs = vars(args)
    runner(**kwargs)


if __name__ == '__main__':
    main()
\ No newline at end of file
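
API sketch (illustrative, not part of the commit): stage 2 can also be driven programmatically. clip() takes '#'-separated target texts, each optionally suffixed with a [begin,end] millisecond offset pair, the same convention the gradio UI in launch.py exposes; 'input.wav' below is a placeholder path:

    import librosa
    from videoclipper import VideoClipper
    from trans_utils import load_state

    state = load_state('./output')                # produced by --stage 1
    wav, sr = librosa.load('input.wav', sr=16000)
    state['audio_input'] = (sr, wav)
    clipper = VideoClipper(None)                  # no ASR model needed for clipping
    (sr, audio), message, clip_srt = clipper.clip('你好世界[100,200]#再见', 0, 0, state)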