3de5c1e5 by 籽li

first

# argparse_tools.py: a minimal ArgumentParser with YAML config-file support.
import argparse
import sys
from pathlib import Path

import yaml


class ArgumentParser(argparse.ArgumentParser):
    """Simple ArgumentParser subclass supporting a YAML config file.

    This class originated from https://github.com/bw2/ConfigArgParse,
    but it is intentionally simpler and differs in a few ways:

    - Multiple config files are not supported.
    - "--config" is added as an option automatically.
    - No format other than YAML is supported.
    - Argument types are not checked.

    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.add_argument("--config", help="Give config file in yaml format")

    def parse_known_args(self, args=None, namespace=None):
        # Parse once only to pick up "--config"
        _args, _ = super().parse_known_args(args, namespace)
        if _args.config is not None:
            if not Path(_args.config).exists():
                self.error(f"No such file: {_args.config}")

            with open(_args.config, "r", encoding="utf-8") as f:
                d = yaml.safe_load(f)
            if not isinstance(d, dict):
                self.error(f"Config file has non dict value: {_args.config}")

            for key in d:
                for action in self._actions:
                    if key == action.dest:
                        break
                else:
                    self.error(f"unrecognized arguments: {key} (from {_args.config})")

            # NOTE(kamo): Ignore "--config" from a config file
            # NOTE(kamo): Unlike "configargparse", this module doesn't check type.
            # i.e. We can set any type value regardless of argument type.
            self.set_defaults(**d)
        return super().parse_known_args(args, namespace)


def get_commandline_args():
    extra_chars = [
        " ",
        ";",
        "&",
        "(",
        ")",
        "|",
        "^",
        "<",
        ">",
        "?",
        "*",
        "[",
        "]",
        "$",
        "`",
        '"',
        "\\",
        "!",
        "{",
        "}",
    ]

    # Escape the extra characters for shell
    argv = [
        arg.replace("'", "'\\''")
        if all(char not in arg for char in extra_chars)
        else "'" + arg.replace("'", "'\\''") + "'"
        for arg in sys.argv
    ]

    return sys.executable + " " + " ".join(argv)
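
# Illustrative usage (a sketch, not part of the module): a script whose parser is
# built from this ArgumentParser accepts "--config <file.yaml>", where the YAML
# keys must match declared argument names; those values become defaults and can
# still be overridden on the command line. The file name conf.yaml below is only
# an assumption for the example.
#
#   conf.yaml:
#     sd_switch: "yes"
#     output_dir: ./output
#
#   python videoclipper.py --config conf.yaml --stage 1 --file input.mp4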


# Standalone moviepy demo: burn a few hard-coded subtitles onto the first 8 s of
# a sample video.
from moviepy.editor import *
from moviepy.video.tools.subtitles import SubtitlesClip

generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=48, color='white')
subs = [((0, 2), 'sub1中文字幕'),
        ((2, 4), 'subs2'),
        ((4, 6), 'subs3'),
        ((6, 8), 'subs4')]

subtitles = SubtitlesClip(subs, generator)

video = VideoFileClip("examples/2022云栖大会_片段.mp4")
video = video.subclip(0, 8)
video = CompositeVideoClip([video, subtitles.set_pos(('center', 'bottom'))])

video.write_videofile("test_output.mp4")


# Gradio demo: speech recognition plus text/speaker-based audio and video clipping.
import gradio as gr
from funasr import AutoModel
from videoclipper import VideoClipper

if __name__ == "__main__":
    funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                             model_revision="v2.0.4",
                             vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                             vad_model_revision="v2.0.4",
                             punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                             punc_model_revision="v2.0.4",
                             spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
                             spk_model_revision="v2.0.2",
                             )
    audio_clipper = VideoClipper(funasr_model)

    def audio_recog(audio_input, sd_switch, hotwords):
        return audio_clipper.recog(audio_input, sd_switch, hotwords=hotwords)

    def audio_clip(dest_text, audio_spk_input, start_ost, end_ost, state):
        return audio_clipper.clip(dest_text, start_ost, end_ost, state, dest_spk=audio_spk_input)

    def video_recog(video_input, sd_switch, hotwords):
        return audio_clipper.video_recog(video_input, sd_switch, hotwords)

    def video_clip(dest_text, video_spk_input, start_ost, end_ost, state):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, dest_spk=video_spk_input)

    def video_clip_addsub(dest_text, video_spk_input, start_ost, end_ost, state, font_size, font_color):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, font_size, font_color, add_sub=True,
                                        dest_spk=video_spk_input)

    top_md_1 = ("""
**<font color="#1785c4"></font>**
<div align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
</div>
</div>
""")

    top_md_2 = ("""
<div align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
</div>
</div>
""")

    top_md_3 = ("""
* Step1: 上传视频或音频文件(或使用下方的用例体验),点击 **<font color="#f7802b">识别</font>** 按钮
* Step2: 复制识别结果中所需的文字至右上方,或者在右侧设置说话人标识,设置偏移与字幕配置(可选)
* Step3: 点击 **<font color="#f7802b">裁剪</font>** 按钮或 **<font color="#f7802b">裁剪并添加字幕</font>** 按钮获得结果
""")

    # gradio interface
    with gr.Blocks() as demo:
        # gr.Image("./examples/guide.png", show_label=False)
        gr.Markdown(top_md_1)
        gr.Markdown(top_md_2)
        gr.Markdown(top_md_3)
        video_state = gr.State()
        audio_state = gr.State()
        with gr.Tab("🎥✂️视频裁剪 Video Clipping"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="🎥视频输入 Video Input")
                    with gr.Row():
                        video_sd_switch = gr.Radio(["no", "yes"], label="👥是否区分说话人 Recognize Speakers",
                                                   value='no')
                        hotwords_input = gr.Textbox(label="🚒热词 Hotwords")
                    recog_button2 = gr.Button("👂识别 Recognize")
                    video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    video_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    video_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
                    video_spk_input = gr.Textbox(label="✏️待裁剪说话人 Speaker to Clip (多个说话人使用'#'连接)")
                    with gr.Row():
                        video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                    label="⏪开始位置偏移 Start Offset (ms)")
                        video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50,
                                                  label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2,
                                              label="🔠字幕字体大小 Subtitle Font Size")
                        font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color",
                                              value='white')
                        # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
                    with gr.Row():
                        clip_button2 = gr.Button("✂️裁剪 Clip")
                        clip_button3 = gr.Button("✂️裁剪并添加字幕 Clip and Generate Subtitles")
                    video_output = gr.Video(label="🎥裁剪结果 Video Clipped")
                    video_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    video_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        with gr.Tab("🔊✂️音频裁剪 Audio Clipping"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(label="🔊音频输入 Audio Input")
                    with gr.Row():
                        audio_sd_switch = gr.Radio(["no", "yes"], label="👥是否区分说话人 Recognize Speakers",
                                                   value='no')
                        hotwords_input2 = gr.Textbox(label="🚒热词 Hotwords")
                    recog_button1 = gr.Button("👂识别 Recognize")
                    audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    audio_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    audio_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
                    audio_spk_input = gr.Textbox(label="✏️待裁剪说话人 Speaker to Clip (多个说话人使用'#'连接)")
                    with gr.Row():
                        audio_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                    label="⏪开始位置偏移 Start Offset (ms)")
                        audio_end_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                  label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        clip_button1 = gr.Button("✂️裁剪 Clip")
                    audio_output = gr.Audio(label="🔊裁剪结果 Audio Clipped")
                    audio_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    audio_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        recog_button1.click(audio_recog,
                            inputs=[audio_input, audio_sd_switch, hotwords_input2],
                            outputs=[audio_text_output, audio_srt_output, audio_state])
        clip_button1.click(audio_clip,
                           inputs=[audio_text_input, audio_spk_input, audio_start_ost, audio_end_ost, audio_state],
                           outputs=[audio_output, audio_mess_output, audio_srt_clip_output])

        recog_button2.click(video_recog,
                            inputs=[video_input, video_sd_switch, hotwords_input],
                            outputs=[video_text_output, video_srt_output, video_state])
        clip_button2.click(video_clip,
                           inputs=[video_text_input, video_spk_input, video_start_ost, video_end_ost, video_state],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])
        clip_button3.click(video_clip_addsub,
                           inputs=[video_text_input, video_spk_input, video_start_ost, video_end_ost, video_state,
                                   font_size, font_color],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])

    # start the gradio service locally
    demo.launch()


# subtitle_utils.py: SRT generation helpers.

def time_convert(ms):
    ms = int(ms)
    tail = ms % 1000
    s = ms // 1000
    mi = s // 60
    s = s % 60
    h = mi // 60
    mi = mi % 60
    h = "00" if h == 0 else str(h)
    mi = "00" if mi == 0 else str(mi)
    s = "00" if s == 0 else str(s)
    # SRT requires a zero-padded, three-digit millisecond field
    tail = str(tail).zfill(3)
    if len(h) == 1: h = '0' + h
    if len(mi) == 1: mi = '0' + mi
    if len(s) == 1: s = '0' + s
    return "{}:{}:{},{}".format(h, mi, s, tail)
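# For example (a sketch of the expected output, assuming the zero-padding above):
#   time_convert(3723456) -> "01:02:03,456"
#   time_convert(5005)    -> "00:00:05,005"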


class Text2SRT():
    def __init__(self, text, timestamp, offset=0):
        self.token_list = [i for i in text.split() if len(i)]
        self.timestamp = timestamp
        start, end = timestamp[0][0] - offset, timestamp[-1][1] - offset
        self.start_sec, self.end_sec = start, end
        self.start_time = time_convert(start)
        self.end_time = time_convert(end)

    def text(self):
        res = ""
        for word in self.token_list:
            if '\u4e00' <= word <= '\u9fff':
                res += word
            else:
                res += " " + word
        return res

    def len(self):
        return len(self.token_list)

    def srt(self, acc_ost=0.0):
        return "{} --> {}\n{}\n".format(
            time_convert(self.start_sec + acc_ost * 1000),
            time_convert(self.end_sec + acc_ost * 1000),
            self.text())

    def time(self, acc_ost=0.0):
        return (self.start_sec / 1000 + acc_ost, self.end_sec / 1000 + acc_ost)


def generate_srt(sentence_list):
    srt_total = ''
    for i, d in enumerate(sentence_list):
        t2s = Text2SRT(d['text'], d['timestamp'])
        if 'spk' in d:
            srt_total += "{} spk{}\n{}".format(i, d['spk'], t2s.srt())
        else:
            srt_total += "{}\n{}".format(i, t2s.srt())
    return srt_total

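# Each entry produced above follows the usual SRT block layout, roughly
# (values below are illustrative only):
#   0 spk0
#   00:00:01,000 --> 00:00:02,500
#   欢迎 来到 云栖 大会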

def generate_srt_clip(sentence_list, start, end, begin_index=0, time_acc_ost=0.0):
    start, end = int(start * 1000), int(end * 1000)
    srt_total = ''
    cc = 1 + begin_index
    subs = []
    for i, d in enumerate(sentence_list):
        if d['timestamp'][-1][1] <= start:
            continue
        if d['timestamp'][0][0] >= end:
            break
        # sentences fully inside the clip window
        if (d['timestamp'][-1][1] <= end and d['timestamp'][0][0] > start) \
                or (d['timestamp'][-1][1] == end and d['timestamp'][0][0] == start):
            t2s = Text2SRT(d['text'], d['timestamp'], offset=start)
            srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
            subs.append((t2s.time(time_acc_ost), t2s.text()))
            cc += 1
            continue
        # sentence starts before the clip window
        if d['timestamp'][0][0] <= start:
            if not d['timestamp'][-1][1] > end:
                for j, ts in enumerate(d['timestamp']):
                    if ts[1] > start:
                        break
                _text = " ".join(d['text'].split()[j:])
                _ts = d['timestamp'][j:]
            else:
                for j, ts in enumerate(d['timestamp']):
                    if ts[1] > start:
                        _start = j
                        break
                for j, ts in enumerate(d['timestamp']):
                    if ts[1] > end:
                        _end = j
                        break
                _text = " ".join(d['text'].split()[_start:_end])
                _ts = d['timestamp'][_start:_end]
            if len(_ts):
                t2s = Text2SRT(_text, _ts, offset=start)
                srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
                subs.append((t2s.time(time_acc_ost), t2s.text()))
                cc += 1
            continue
        # sentence ends after the clip window
        if d['timestamp'][-1][1] > end:
            for j, ts in enumerate(d['timestamp']):
                if ts[1] > end:
                    break
            _text = " ".join(d['text'].split()[:j])
            _ts = d['timestamp'][:j]
            if len(_ts):
                t2s = Text2SRT(_text, _ts, offset=start)
                srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
                subs.append(
                    (t2s.time(time_acc_ost), t2s.text())
                )
                cc += 1
            continue
    return srt_total, subs, cc


# Command-line example for videoclipper.py
# step 1: Recognize
python videoclipper.py --stage 1 \
                       --file ../examples/2022云栖大会_片段.mp4 \
                       --sd_switch yes \
                       --output_dir ./output
# now you can find the recognition results and the full SRT file in ./output/

# step 2: Clip
python videoclipper.py --stage 2 \
                       --file ../examples/2022云栖大会_片段.mp4 \
                       --output_dir ./output \
                       --dest_text '所以这个是我们办这个奖的初心啊,我们也会一届一届的办下去' \
                       --start_ost 0 \
                       --end_ost 100 \
                       --output_file './output/res.mp4'
                       # add "--dest_spk spk0" to clip by speaker instead of by text
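# With these settings, step 2 should write the clipped video to ./output/res.mp4
# and, based on how videoclipper.py derives the subtitle path, a matching
# ./output/res.srt next to it (a sketch of the expected outputs, not verified).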


# trans_utils.py: text/timestamp matching and state (de)serialization helpers.
import os

import numpy as np

PUNC_LIST = [',', '。', '!', '?', '、']


def pre_proc(text):
    res = ''
    for i in range(len(text)):
        if text[i] in PUNC_LIST:
            continue
        if '\u4e00' <= text[i] <= '\u9fff':
            if len(res) and res[-1] != " ":
                res += ' ' + text[i] + ' '
            else:
                res += text[i] + ' '
        else:
            res += text[i]
    if len(res) and res[-1] == ' ':
        res = res[:-1]
    return res
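# For example (a sketch): pre_proc('我们Alibaba奖。') returns '我 们 Alibaba 奖',
# i.e. CJK characters are space-separated, Latin text is kept together, and
# punctuation from PUNC_LIST is dropped.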


def proc(raw_text, timestamp, dest_text):
    # simple substring matching between the pre-processed recognition result
    # and the destination text
    ld = len(dest_text.split())
    mi, ts = [], []
    offset = 0
    while True:
        fi = raw_text.find(dest_text, offset, len(raw_text))
        if fi == -1:
            break
        ti = raw_text[:fi].count(' ')
        offset = fi + ld
        mi.append(fi)
        ts.append([timestamp[ti][0] * 16, timestamp[ti + ld - 1][1] * 16])
    return ts


def proc_spk(dest_spk, sd_sentences):
    ts = []
    spk_id = dest_spk[3:]  # dest_spk looks like "spk0", "spk1", ...
    for d in sd_sentences:
        d_start = d['timestamp'][0][0]
        d_end = d['timestamp'][-1][1]
        # keep only segments of the requested speaker longer than one second
        if str(d['spk']) == spk_id and d_end - d_start > 999:
            ts.append([d['start'] * 16, d['end'] * 16])
    return ts


def generate_vad_data(data, sd_sentences, sr=16000):
    assert len(data.shape) == 1
    vad_data = []
    for d in sd_sentences:
        d_start = round(d['ts_list'][0][0] / 1000, 2)
        d_end = round(d['ts_list'][-1][1] / 1000, 2)
        vad_data.append([d_start, d_end, data[int(d_start * sr):int(d_end * sr)]])
    return vad_data


def write_state(output_dir, state):
    for key in ['/recog_res_raw', '/timestamp', '/sentences']:
        with open(output_dir + key, 'w') as fout:
            fout.write(str(state[key[1:]]))
    if 'sd_sentences' in state:
        with open(output_dir + '/sd_sentences', 'w') as fout:
            fout.write(str(state['sd_sentences']))


def load_state(output_dir):
    # NOTE: the state files are trusted local artifacts produced by write_state,
    # which is why eval() is used to read them back.
    state = {}
    with open(output_dir + '/recog_res_raw') as fin:
        line = fin.read()
        state['recog_res_raw'] = line
    with open(output_dir + '/timestamp') as fin:
        line = fin.read()
        state['timestamp'] = eval(line)
    with open(output_dir + '/sentences') as fin:
        line = fin.read()
        state['sentences'] = eval(line)
    if os.path.exists(output_dir + '/sd_sentences'):
        with open(output_dir + '/sd_sentences') as fin:
            line = fin.read()
            state['sd_sentences'] = eval(line)
    return state


def convert_pcm_to_float(data):
    if data.dtype == np.float64:
        return data
    elif data.dtype == np.float32:
        return data.astype(np.float64)
    elif data.dtype == np.int16:
        bit_depth = 16
    elif data.dtype == np.int32:
        bit_depth = 32
    elif data.dtype == np.int8:
        bit_depth = 8
    else:
        raise ValueError("Unsupported audio data type")

    # Now handle the integer types
    max_int_value = float(2 ** (bit_depth - 1))
    if bit_depth == 8:
        data = data - 128
    return (data.astype(np.float64) / max_int_value)
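
# For example (a sketch): np.array([16384, -32768], dtype=np.int16) maps to
# approximately [0.5, -1.0] after convert_pcm_to_float, and int8 data is first
# re-centred around zero (data - 128) before scaling.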


# videoclipper.py: recognition and clipping core plus a simple CLI.
import re
import os
import sys
import copy
import librosa
import logging
import argparse
import numpy as np
import soundfile as sf
import moviepy.editor as mpy
from moviepy.editor import *
from moviepy.video.tools.subtitles import SubtitlesClip
from subtitle_utils import generate_srt, generate_srt_clip
from argparse_tools import ArgumentParser, get_commandline_args
from trans_utils import pre_proc, proc, write_state, load_state, proc_spk, convert_pcm_to_float


class VideoClipper():
    def __init__(self, funasr_model):
        logging.warning("Initializing VideoClipper.")
        self.funasr_model = funasr_model
        self.GLOBAL_COUNT = 0

    def recog(self, audio_input, sd_switch='no', state=None, hotwords=""):
        if state is None:
            state = {}
        sr, data = audio_input

        # Convert to float64 consistently (includes data type checking)
        data = convert_pcm_to_float(data)

        if sr != 16000:  # resample with librosa
            data = librosa.resample(data, orig_sr=sr, target_sr=16000)
            sr = 16000
        if len(data.shape) == 2:  # multi-channel wav input
            logging.warning("Input wav shape: {}, only first channel reserved.".format(data.shape))
            data = data[:, 0]
        state['audio_input'] = (sr, data)
        if sd_switch == 'yes':
            rec_result = self.funasr_model.generate(data, return_raw_text=True, is_final=True, hotword=hotwords)
            res_srt = generate_srt(rec_result[0]['sentence_info'])
            state['sd_sentences'] = rec_result[0]['sentence_info']
        else:
            rec_result = self.funasr_model.generate(data,
                                                    return_spk_res=False,
                                                    sentence_timestamp=True,
                                                    return_raw_text=True,
                                                    is_final=True,
                                                    hotword=hotwords)
            res_srt = generate_srt(rec_result[0]['sentence_info'])
        state['recog_res_raw'] = rec_result[0]['raw_text']
        state['timestamp'] = rec_result[0]['timestamp']
        state['sentences'] = rec_result[0]['sentence_info']
        res_text = rec_result[0]['text']
        return res_text, res_srt, state

    def clip(self, dest_text, start_ost, end_ost, state, dest_spk=None):
        # get from state
        audio_input = state['audio_input']
        recog_res_raw = state['recog_res_raw']
        timestamp = state['timestamp']
        sentences = state['sentences']
        sr, data = audio_input
        data = data.astype(np.float64)

        all_ts = []
        if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
            for _dest_text in dest_text.split('#'):
                match = None
                if '[' in _dest_text:
                    match = re.search(r'\[(\d+),\s*(\d+)\]', _dest_text)
                    if match:
                        offset_b, offset_e = map(int, match.groups())
                        log_append = ""
                    else:
                        offset_b, offset_e = 0, 0
                        log_append = "(Bracket detected in dest_text but offset time matching failed)"
                    _dest_text = _dest_text[:_dest_text.find('[')]
                else:
                    log_append = ""
                    offset_b, offset_e = 0, 0
                _dest_text = pre_proc(_dest_text)
                ts = proc(recog_res_raw, timestamp, _dest_text)
                for _ts in ts:
                    all_ts.append([_ts[0] + offset_b * 16, _ts[1] + offset_e * 16])
                if len(ts) > 1 and match:
                    log_append += "(offsets detected, but this sub-sentence matched {} periods in the audio; " \
                                  "the offsets are applied to all of them)".format(len(ts))
        else:
            for _dest_spk in dest_spk.split('#'):
                ts = proc_spk(_dest_spk, state['sd_sentences'])
                for _ts in ts:
                    all_ts.append(_ts)
            log_append = ""
        ts = all_ts
        # ts.sort()
        srt_index = 0
        clip_srt = ""
        if len(ts):
            start, end = ts[0]
            start = min(max(0, start + start_ost * 16), len(data))
            end = min(max(0, end + end_ost * 16), len(data))
            res_audio = data[start:end]
            start_end_info = "from {} to {}".format(start / 16000, end / 16000)
            srt_clip, _, srt_index = generate_srt_clip(sentences, start / 16000.0, end / 16000.0, begin_index=srt_index)
            clip_srt += srt_clip
            for _ts in ts[1:]:  # multiple sentence input or multiple output matched
                start, end = _ts
                start = min(max(0, start + start_ost * 16), len(data))
                end = min(max(0, end + end_ost * 16), len(data))
                start_end_info += ", from {} to {}".format(start / 16000, end / 16000)
                # start/end already include the offsets, so slice with them directly
                res_audio = np.concatenate([res_audio, data[start:end]], -1)
                srt_clip, _, srt_index = generate_srt_clip(sentences, start / 16000.0, end / 16000.0, begin_index=srt_index - 1)
                clip_srt += srt_clip
            message = "{} periods found in the speech: ".format(len(ts)) + start_end_info + log_append
        else:
            message = "No period found in the speech, return raw speech. You may check the recognition result and try other destination text."
            res_audio = data
        return (sr, res_audio), message, clip_srt

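    # Offset brackets in dest_text (an illustrative note): a segment such as
    # "我们办这个奖的初心[100,200]" shifts every matched period's start by +100 ms
    # and its end by +200 ms before clipping; values are in milliseconds and only
    # non-negative integers are recognized by the regex above.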
    def video_recog(self, video_filename, sd_switch='no', hotwords=""):
        video = mpy.VideoFileClip(video_filename)
        # Extract the base name, add '_clip.mp4' and '.wav'
        base_name, _ = os.path.splitext(video_filename)
        clip_video_file = base_name + '_clip.mp4'
        audio_file = base_name + '.wav'
        video.audio.write_audiofile(audio_file)
        wav = librosa.load(audio_file, sr=16000)[0]
        # delete the temporary audio file after loading it
        if os.path.exists(audio_file):
            os.remove(audio_file)
        state = {
            'video_filename': video_filename,
            'clip_video_file': clip_video_file,
            'video': video,
        }
        return self.recog((16000, wav), sd_switch, state, hotwords)

    def video_clip(self, dest_text, start_ost, end_ost, state, font_size=32, font_color='white', add_sub=False, dest_spk=None):
        # get from state
        recog_res_raw = state['recog_res_raw']
        timestamp = state['timestamp']
        sentences = state['sentences']
        video = state['video']
        clip_video_file = state['clip_video_file']
        video_filename = state['video_filename']

        all_ts = []
        srt_index = 0
        if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
            for _dest_text in dest_text.split('#'):
                match = None
                if '[' in _dest_text:
                    match = re.search(r'\[(\d+),\s*(\d+)\]', _dest_text)
                    if match:
                        offset_b, offset_e = map(int, match.groups())
                        log_append = ""
                    else:
                        offset_b, offset_e = 0, 0
                        log_append = "(Bracket detected in dest_text but offset time matching failed)"
                    _dest_text = _dest_text[:_dest_text.find('[')]
                else:
                    offset_b, offset_e = 0, 0
                    log_append = ""
                _dest_text = pre_proc(_dest_text)
                ts = proc(recog_res_raw, timestamp, _dest_text)
                for _ts in ts:
                    all_ts.append([_ts[0] + offset_b * 16, _ts[1] + offset_e * 16])
                if len(ts) > 1 and match:
                    log_append += "(offsets detected, but this sub-sentence matched {} periods in the audio; " \
                                  "the offsets are applied to all of them)".format(len(ts))
        else:
            for _dest_spk in dest_spk.split('#'):
                ts = proc_spk(_dest_spk, state['sd_sentences'])
                for _ts in ts:
                    all_ts.append(_ts)
        time_acc_ost = 0.0
        ts = all_ts
        # ts.sort()
        clip_srt = ""
        if len(ts):
            start, end = ts[0][0] / 16000, ts[0][1] / 16000
            srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index, time_acc_ost=time_acc_ost)
            start, end = start + start_ost / 1000.0, end + end_ost / 1000.0
            video_clip = video.subclip(start, end)
            start_end_info = "from {} to {}".format(start, end)
            clip_srt += srt_clip
            if add_sub:
                generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
                subtitles = SubtitlesClip(subs, generator)
                video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center', 'bottom'))])
            concate_clip = [video_clip]
            time_acc_ost += end - start  # duration of the segment just added
            for _ts in ts[1:]:
                start, end = _ts[0] / 16000, _ts[1] / 16000
                srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index - 1, time_acc_ost=time_acc_ost)
                chi_subs = []
                sub_starts = subs[0][0][0]
                for sub in subs:
                    chi_subs.append(((sub[0][0] - sub_starts, sub[0][1] - sub_starts), sub[1]))
                start, end = start + start_ost / 1000.0, end + end_ost / 1000.0
                _video_clip = video.subclip(start, end)
                start_end_info += ", from {} to {}".format(start, end)
                clip_srt += srt_clip
                if add_sub:
                    generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
                    subtitles = SubtitlesClip(chi_subs, generator)
                    _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center', 'bottom'))])
                concate_clip.append(copy.copy(_video_clip))
                time_acc_ost += end - start  # duration of the segment just added
            message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
            logging.warning("Concating...")
            if len(concate_clip) > 1:
                video_clip = concatenate_videoclips(concate_clip)
            clip_video_file = clip_video_file[:-4] + '_no{}.mp4'.format(self.GLOBAL_COUNT)
            video_clip.write_videofile(clip_video_file, audio_codec="aac", temp_audiofile="video_no{}.mp4".format(self.GLOBAL_COUNT))
            self.GLOBAL_COUNT += 1
        else:
            clip_video_file = video_filename
            message = "No period found in the audio, the original video is returned. You may check the recognition result and try other destination text."
            srt_clip = ''
        return clip_video_file, message, clip_srt


def get_parser():
    parser = ArgumentParser(
        description="ClipVideo Argument",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--stage",
        type=int,
        choices=(1, 2),
        help="Stage, 1 for recognizing and 2 for clipping",
        required=True
    )
    parser.add_argument(
        "--file",
        type=str,
        default=None,
        help="Input file path",
        required=True
    )
    parser.add_argument(
        "--sd_switch",
        type=str,
        choices=("no", "yes"),
        default="no",
        help="Turn on the speaker diarization or not",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default='./output',
        help="Output files path",
    )
    parser.add_argument(
        "--dest_text",
        type=str,
        default=None,
        help="Destination text string for clipping",
    )
    parser.add_argument(
        "--dest_spk",
        type=str,
        default=None,
        help="Destination spk id for clipping",
    )
    parser.add_argument(
        "--start_ost",
        type=int,
        default=0,
        help="Offset time in ms at beginning for clipping"
    )
    parser.add_argument(
        "--end_ost",
        type=int,
        default=0,
        help="Offset time in ms at ending for clipping"
    )
    parser.add_argument(
        "--output_file",
        type=str,
        default=None,
        help="Output file path"
    )
    return parser


def runner(stage, file, sd_switch, output_dir, dest_text, dest_spk, start_ost, end_ost, output_file, config=None):
    audio_suffixes = ['.wav', '.mp3', '.aac', '.m4a', '.flac']
    video_suffixes = ['.mp4', '.avi', '.mkv', '.flv', '.mov', '.webm', '.ts', '.mpeg']
    _, ext = os.path.splitext(file)
    if ext.lower() in audio_suffixes:
        mode = 'audio'
    elif ext.lower() in video_suffixes:
        mode = 'video'
    else:
        logging.error("Unsupported file format: {}\n\nPlease choose one of the following: {}".format(
            file, audio_suffixes + video_suffixes))
        sys.exit(1)  # exit if the file is not supported
    while output_dir.endswith('/'):
        output_dir = output_dir[:-1]
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    if stage == 1:
        from funasr import AutoModel
        # initialize the funasr AutoModel pipeline
        logging.warning("Initializing modelscope asr pipeline.")
        funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                                 model_revision="v2.0.4",
                                 vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                                 vad_model_revision="v2.0.4",
                                 punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                                 punc_model_revision="v2.0.4",
                                 spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
                                 spk_model_revision="v2.0.2",
                                 )
        audio_clipper = VideoClipper(funasr_model)
        if mode == 'audio':
            logging.warning("Recognizing audio file: {}".format(file))
            wav, sr = librosa.load(file, sr=16000)
            res_text, res_srt, state = audio_clipper.recog((sr, wav), sd_switch)
        if mode == 'video':
            logging.warning("Recognizing video file: {}".format(file))
            res_text, res_srt, state = audio_clipper.video_recog(file, sd_switch)
        total_srt_file = output_dir + '/total.srt'
        with open(total_srt_file, 'w') as fout:
            fout.write(res_srt)
        logging.warning("Write total subtitle to {}".format(total_srt_file))
        write_state(output_dir, state)
        logging.warning("Recognition succeeded. You can copy the text segment from below and use stage 2.")
        print(res_text)
    if stage == 2:
        audio_clipper = VideoClipper(None)
        if mode == 'audio':
            state = load_state(output_dir)
            wav, sr = librosa.load(file, sr=16000)
            state['audio_input'] = (sr, wav)
            (sr, audio), message, srt_clip = audio_clipper.clip(dest_text, start_ost, end_ost, state, dest_spk=dest_spk)
            if output_file is None:
                output_file = output_dir + '/result.wav'
            assert output_file.endswith('.wav'), "output_file must end with '.wav'"
            clip_srt_file = output_file[:-3] + 'srt'
            logging.warning(message)
            sf.write(output_file, audio, 16000)
            logging.warning("Save clipped wav file to {}".format(output_file))
            with open(clip_srt_file, 'w') as fout:
                fout.write(srt_clip)
            logging.warning("Write clipped subtitle to {}".format(clip_srt_file))
        if mode == 'video':
            state = load_state(output_dir)
            state['video_filename'] = file
            if output_file is None:
                state['clip_video_file'] = file[:-4] + '_clip.mp4'
            else:
                state['clip_video_file'] = output_file
            clip_srt_file = state['clip_video_file'][:-3] + 'srt'
            state['video'] = mpy.VideoFileClip(file)
            clip_video_file, message, srt_clip = audio_clipper.video_clip(dest_text, start_ost, end_ost, state, dest_spk=dest_spk)
            logging.warning("Clipping Log: {}".format(message))
            logging.warning("Save clipped mp4 file to {}".format(clip_video_file))
            with open(clip_srt_file, 'w') as fout:
                fout.write(srt_clip)
            logging.warning("Write clipped subtitle to {}".format(clip_srt_file))


def main(cmd=None):
    print(get_commandline_args(), file=sys.stderr)
    parser = get_parser()
    args = parser.parse_args(cmd)
    kwargs = vars(args)
    runner(**kwargs)


if __name__ == '__main__':
    main()