3de5c1e5 by 籽li

first

import argparse
import sys
from pathlib import Path

import yaml


class ArgumentParser(argparse.ArgumentParser):
    """Simple implementation of ArgumentParser supporting config files.

    This class originates from https://github.com/bw2/ConfigArgParse,
    but it lacks some of the features that library has:

    - No support for multiple config files
    - "--config" is always added as an option automatically
    - No formats other than YAML are supported
    - Argument types are not checked

    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.add_argument("--config", help="Give config file in yaml format")

    def parse_known_args(self, args=None, namespace=None):
        # Parse once beforehand, only to pick up "--config"
        _args, _ = super().parse_known_args(args, namespace)
        if _args.config is not None:
            if not Path(_args.config).exists():
                self.error(f"No such file: {_args.config}")

            with open(_args.config, "r", encoding="utf-8") as f:
                d = yaml.safe_load(f)
            if not isinstance(d, dict):
                self.error(f"Config file has non dict value: {_args.config}")

            for key in d:
                for action in self._actions:
                    if key == action.dest:
                        break
                else:
                    self.error(f"unrecognized arguments: {key} (from {_args.config})")

            # NOTE(kamo): Ignore "--config" from a config file
            # NOTE(kamo): Unlike "configargparse", this module doesn't check type,
            # i.e. a value of any type can be set regardless of the argument type.
            self.set_defaults(**d)
        return super().parse_known_args(args, namespace)


def get_commandline_args():
    extra_chars = [
        " ",
        ";",
        "&",
        "(",
        ")",
        "|",
        "^",
        "<",
        ">",
        "?",
        "*",
        "[",
        "]",
        "$",
        "`",
        '"',
        "\\",
        "!",
        "{",
        "}",
    ]

    # Escape the extra characters for shell: wrap an argument in single quotes
    # if it contains any character special to the shell, and escape embedded
    # single quotes either way.
    argv = [
        arg.replace("'", "'\\''")
        if all(char not in arg for char in extra_chars)
        else "'" + arg.replace("'", "'\\''") + "'"
        for arg in sys.argv
    ]

    return sys.executable + " " + " ".join(argv)
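

# Usage sketch (not part of this commit): the parser above pulls defaults
# from a YAML file passed via "--config". Assuming a hypothetical conf.yaml
# containing "batch_size: 8":
#
#     parser = ArgumentParser(description="demo")
#     parser.add_argument("--batch_size", type=int, default=1)
#     args = parser.parse_args(["--config", "conf.yaml"])
#     assert args.batch_size == 8  # the YAML value replaces the default
#
# The YAML values are injected with set_defaults() before the final parse,
# so flags given explicitly on the command line still take precedence.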
\ No newline at end of file
from moviepy.editor import CompositeVideoClip, TextClip, VideoFileClip
from moviepy.video.tools.subtitles import SubtitlesClip

# Build a TextClip for each subtitle line; SubtitlesClip calls this
# generator with the text of each time span.
generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=48, color='white')
subs = [((0, 2), 'sub1中文字幕'),
        ((2, 4), 'subs2'),
        ((4, 6), 'subs3'),
        ((6, 8), 'subs4')]

subtitles = SubtitlesClip(subs, generator)

video = VideoFileClip("examples/2022云栖大会_片段.mp4")
video = video.subclip(0, 8)
video = CompositeVideoClip([video, subtitles.set_pos(('center', 'bottom'))])

video.write_videofile("test_output.mp4")
\ No newline at end of file
import gradio as gr
from funasr import AutoModel
from videoclipper import VideoClipper

if __name__ == "__main__":
    # FunASR pipeline: Paraformer ASR with attached VAD, punctuation
    # restoration and speaker (campplus) models for diarization
    funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                             model_revision="v2.0.4",
                             vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                             vad_model_revision="v2.0.4",
                             punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                             punc_model_revision="v2.0.4",
                             spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
                             spk_model_revision="v2.0.2",
                             )
    audio_clipper = VideoClipper(funasr_model)

    def audio_recog(audio_input, sd_switch, hotwords):
        return audio_clipper.recog(audio_input, sd_switch, hotwords=hotwords)

    def audio_clip(dest_text, audio_spk_input, start_ost, end_ost, state):
        return audio_clipper.clip(dest_text, start_ost, end_ost, state, dest_spk=audio_spk_input)

    def video_recog(video_input, sd_switch, hotwords):
        return audio_clipper.video_recog(video_input, sd_switch, hotwords)

    def video_clip(dest_text, video_spk_input, start_ost, end_ost, state):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, dest_spk=video_spk_input)

    def video_clip_addsub(dest_text, video_spk_input, start_ost, end_ost, state, font_size, font_color):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, font_size, font_color, add_sub=True,
                                        dest_spk=video_spk_input)

    top_md_1 = ("""
**<font color="#1785c4"></font>**
<div align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
</div>
</div>
""")

    top_md_2 = ("""
<div align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
</div>
</div>
""")

    top_md_3 = ("""
* Step1: 上传视频或音频文件(或使用下方的用例体验),点击 **<font color="#f7802b">识别</font>** 按钮
* Step2: 复制识别结果中所需的文字至右上方,或者在右侧设置说话人标识,设置偏移与字幕配置(可选)
* Step3: 点击 **<font color="#f7802b">裁剪</font>** 按钮或 **<font color="#f7802b">裁剪并添加字幕</font>** 按钮获得结果
""")

    # gradio interface
    with gr.Blocks() as demo:
        # gr.Image("./examples/guide.png", show_label=False)
        gr.Markdown(top_md_1)
        gr.Markdown(top_md_2)
        gr.Markdown(top_md_3)
        video_state = gr.State()
        audio_state = gr.State()
        with gr.Tab("🎥✂️视频裁剪 Video Clipping"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="🎥视频输入 Video Input")
                    with gr.Row():
                        video_sd_switch = gr.Radio(["no", "yes"], label="👥是否区分说话人 Recognize Speakers",
                                                   value='no')
                        hotwords_input = gr.Textbox(label="🚒热词 Hotwords")
                    recog_button2 = gr.Button("👂识别 Recognize")
                    video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    video_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    video_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
                    video_spk_input = gr.Textbox(label="✏️待裁剪说话人 Speaker to Clip (多个说话人使用'#'连接)")
                    with gr.Row():
                        video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                    label="⏪开始位置偏移 Start Offset (ms)")
                        video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50,
                                                  label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2,
                                              label="🔠字幕字体大小 Subtitle Font Size")
                        font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color",
                                              value='white')
                        # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
                    with gr.Row():
                        clip_button2 = gr.Button("✂️裁剪\nClip")
                        clip_button3 = gr.Button("✂️裁剪并添加字幕\nClip and Generate Subtitles")
                    video_output = gr.Video(label="🎥裁剪结果 Video Clipped")
                    video_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    video_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        with gr.Tab("🔊✂️音频裁剪 Audio Clipping"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(label="🔊音频输入 Audio Input")
                    with gr.Row():
                        audio_sd_switch = gr.Radio(["no", "yes"], label="👥是否区分说话人 Recognize Speakers",
                                                   value='no')
                        hotwords_input2 = gr.Textbox(label="🚒热词 Hotwords")
                    recog_button1 = gr.Button("👂识别 Recognize")
                    audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    audio_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    audio_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
                    audio_spk_input = gr.Textbox(label="✏️待裁剪说话人 Speaker to Clip (多个说话人使用'#'连接)")
                    with gr.Row():
                        audio_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                    label="⏪开始位置偏移 Start Offset (ms)")
                        audio_end_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                  label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        clip_button1 = gr.Button("✂️裁剪 Clip")
                    audio_output = gr.Audio(label="🔊裁剪结果 Audio Clipped")
                    audio_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    audio_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        recog_button1.click(audio_recog,
                            inputs=[audio_input, audio_sd_switch, hotwords_input2],
                            outputs=[audio_text_output, audio_srt_output, audio_state])
        clip_button1.click(audio_clip,
                           inputs=[audio_text_input, audio_spk_input, audio_start_ost, audio_end_ost, audio_state],
                           outputs=[audio_output, audio_mess_output, audio_srt_clip_output])

        recog_button2.click(video_recog,
                            inputs=[video_input, video_sd_switch, hotwords_input],
                            outputs=[video_text_output, video_srt_output, video_state])
        clip_button2.click(video_clip,
                           inputs=[video_text_input, video_spk_input, video_start_ost, video_end_ost, video_state],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])
        clip_button3.click(video_clip_addsub,
                           inputs=[video_text_input, video_spk_input, video_start_ost, video_end_ost, video_state,
                                   font_size, font_color],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])

    # start the gradio service locally
    demo.launch()
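
    # If the demo needs to be reachable from other machines, gradio's standard
    # launch options can be passed instead (a sketch, not part of this commit):
    #     demo.launch(server_name="0.0.0.0", server_port=7860)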
def time_convert(ms):
    """Convert a millisecond count to an SRT timestamp "HH:MM:SS,mmm"."""
    ms = int(ms)
    tail = ms % 1000
    s = ms // 1000
    mi, s = divmod(s, 60)
    h, mi = divmod(mi, 60)
    # zero-pad every field; SRT requires exactly three millisecond digits
    return "{:02d}:{:02d}:{:02d},{:03d}".format(h, mi, s, tail)
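
# Worked example (assuming the zero-padded format above):
#     time_convert(3723456) -> "01:02:03,456"
#     time_convert(5005)    -> "00:00:05,005"  (the unpadded tail would read ",5")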

class Text2SRT:
    """Render one recognized sentence (tokens + per-token timestamps) as SRT."""

    def __init__(self, text, timestamp, offset=0):
        self.token_list = [i for i in text.split() if len(i)]
        self.timestamp = timestamp
        start, end = timestamp[0][0] - offset, timestamp[-1][1] - offset
        # NB: despite the names, start_sec/end_sec hold milliseconds
        self.start_sec, self.end_sec = start, end
        self.start_time = time_convert(start)
        self.end_time = time_convert(end)

    def text(self):
        # join tokens: CJK tokens are concatenated, others space-separated
        res = ""
        for word in self.token_list:
            if '\u4e00' <= word <= '\u9fff':
                res += word
            else:
                res += " " + word
        return res

    def len(self):
        return len(self.token_list)

    def srt(self, acc_ost=0.0):
        return "{} --> {}\n{}\n".format(
            time_convert(self.start_sec + acc_ost * 1000),
            time_convert(self.end_sec + acc_ost * 1000),
            self.text())

    def time(self, acc_ost=0.0):
        # (start, end) in seconds, e.g. for moviepy's SubtitlesClip
        return (self.start_sec / 1000 + acc_ost, self.end_sec / 1000 + acc_ost)


def generate_srt(sentence_list):
    srt_total = ''
    for i, d in enumerate(sentence_list):
        t2s = Text2SRT(d['text'], d['timestamp'])
        # SRT subtitle indices conventionally start at 1
        if 'spk' in d:
            srt_total += "{} spk{}\n{}".format(i + 1, d['spk'], t2s.srt())
        else:
            srt_total += "{}\n{}".format(i + 1, t2s.srt())
    return srt_total

def generate_srt_clip(sentence_list, start, end, begin_index=0, time_acc_ost=0.0):
    """Generate SRT (and (time, text) pairs) for sentences inside [start, end) seconds."""
    start, end = int(start * 1000), int(end * 1000)
    srt_total = ''
    cc = 1 + begin_index  # running subtitle index
    subs = []
    for i, d in enumerate(sentence_list):
        if d['timestamp'][-1][1] <= start:
            continue
        if d['timestamp'][0][0] >= end:
            break
        # sentences lying fully inside the clip window
        if (d['timestamp'][-1][1] <= end and d['timestamp'][0][0] > start) \
                or (d['timestamp'][-1][1] == end and d['timestamp'][0][0] == start):
            t2s = Text2SRT(d['text'], d['timestamp'], offset=start)
            srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
            subs.append((t2s.time(time_acc_ost), t2s.text()))
            cc += 1
            continue
        # sentences overlapping the left edge of the window
        if d['timestamp'][0][0] <= start:
            if d['timestamp'][-1][1] <= end:
                # keep the tokens from the first one that ends after `start`
                for j, ts in enumerate(d['timestamp']):
                    if ts[1] > start:
                        break
                _text = " ".join(d['text'].split()[j:])
                _ts = d['timestamp'][j:]
            else:
                # sentence spans the whole window; keep tokens within it
                for j, ts in enumerate(d['timestamp']):
                    if ts[1] > start:
                        _start = j
                        break
                for j, ts in enumerate(d['timestamp']):
                    if ts[1] > end:
                        _end = j
                        break
                _text = " ".join(d['text'].split()[_start:_end])
                _ts = d['timestamp'][_start:_end]
            if len(_ts):
                t2s = Text2SRT(_text, _ts, offset=start)
                srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
                subs.append((t2s.time(time_acc_ost), t2s.text()))
                cc += 1
            continue
        # sentences overlapping the right edge of the window
        if d['timestamp'][-1][1] > end:
            for j, ts in enumerate(d['timestamp']):
                if ts[1] > end:
                    break
            _text = " ".join(d['text'].split()[:j])
            _ts = d['timestamp'][:j]
            if len(_ts):
                t2s = Text2SRT(_text, _ts, offset=start)
                srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
                subs.append(
                    (t2s.time(time_acc_ost), t2s.text())
                )
                cc += 1
            continue
    return srt_total, subs, cc
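

# Usage sketch (not part of this commit; the sentence_list shape is inferred
# from the code above: 'text' holds space-separated tokens, 'timestamp' one
# [start_ms, end_ms] pair per token):
#
#     sentences = [
#         {"text": "hello 世 界", "timestamp": [[0, 300], [500, 700], [700, 900]]},
#         {"text": "bye", "timestamp": [[1200, 1600]]},
#     ]
#     print(generate_srt(sentences))          # full SRT, indices from 1
#     srt_text, subs, next_idx = generate_srt_clip(sentences, 0.0, 1.0)
#     # -> SRT for the 0.0s-1.0s window, (time, text) pairs suitable for
#     #    SubtitlesClip, and the next free subtitle index.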
# step1: Recognize
python videoclipper.py --stage 1 \
    --file ../examples/2022云栖大会_片段.mp4 \
    --sd_switch yes \
    --output_dir ./output
# now you can find the recognition results and the entire SRT file in ./output/
# step2: Clip
python videoclipper.py --stage 2 \
    --file ../examples/2022云栖大会_片段.mp4 \
    --output_dir ./output \
    --dest_text '所以这个是我们办这个奖的初心啊,我们也会一届一届的办下去' \
    --start_ost 0 \
    --end_ost 100 \
    --output_file './output/res.mp4'
# to clip by speaker instead of text, replace --dest_text with e.g. --dest_spk spk0
\ No newline at end of file
import os
import ast

import numpy as np

PUNC_LIST = [',', '。', '!', '?', '、']


def pre_proc(text):
    """Strip punctuation and space-separate CJK characters into single tokens."""
    res = ''
    for i in range(len(text)):
        if text[i] in PUNC_LIST:
            continue
        if '\u4e00' <= text[i] <= '\u9fff':
            if len(res) and res[-1] != " ":
                res += ' ' + text[i] + ' '
            else:
                res += text[i] + ' '
        else:
            res += text[i]
    if res and res[-1] == ' ':
        res = res[:-1]
    return res
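
# e.g. pre_proc("你好,world") -> "你 好 world": each CJK character becomes its
# own token so it aligns one-to-one with the per-token ASR timestamps, and
# the listed punctuation marks are dropped.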

def proc(raw_text, timestamp, dest_text):
    # simple matching: find every occurrence of dest_text in the
    # space-tokenized raw_text and map it to a time/sample range
    ld = len(dest_text.split())
    ts = []
    offset = 0
    while True:
        fi = raw_text.find(dest_text, offset, len(raw_text))
        if fi == -1:
            break
        # token index = number of spaces before the match
        ti = raw_text[:fi].count(' ')
        offset = fi + ld
        # timestamps are in ms; *16 converts to sample offsets at 16 kHz
        ts.append([timestamp[ti][0] * 16, timestamp[ti + ld - 1][1] * 16])
    return ts
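
# e.g. with raw_text "我 们 好" and timestamp [[0, 400], [400, 800], [800, 1200]],
# proc(raw_text, timestamp, "们 好") -> [[6400, 19200]]
# (token 1 starts at 400 ms, token 2 ends at 1200 ms; x16 -> 16 kHz samples).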

def proc_spk(dest_spk, sd_sentences):
    ts = []
    for d in sd_sentences:
        d_start = d['timestamp'][0][0]
        d_end = d['timestamp'][-1][1]
        spkid = dest_spk[3:]  # dest_spk looks like "spk0"; keep the numeric id
        if str(d['spk']) == spkid and d_end - d_start > 999:
            # keep this speaker's sentences lasting at least one second;
            # *16 converts ms to sample offsets at 16 kHz, as in proc()
            ts.append([d['start'] * 16, d['end'] * 16])
    return ts

def generate_vad_data(data, sd_sentences, sr=16000):
    assert len(data.shape) == 1  # mono waveform expected
    vad_data = []
    for d in sd_sentences:
        d_start = round(d['ts_list'][0][0] / 1000, 2)
        d_end = round(d['ts_list'][-1][1] / 1000, 2)
        vad_data.append([d_start, d_end, data[int(d_start * sr):int(d_end * sr)]])
    return vad_data

def write_state(output_dir, state):
    for key in ['/recog_res_raw', '/timestamp', '/sentences']:
        with open(output_dir + key, 'w') as fout:
            fout.write(str(state[key[1:]]))
    if 'sd_sentences' in state:
        with open(output_dir + '/sd_sentences', 'w') as fout:
            fout.write(str(state['sd_sentences']))


def load_state(output_dir):
    # ast.literal_eval parses the repr()-style dumps written above without
    # the arbitrary-code risk of eval()
    state = {}
    with open(output_dir + '/recog_res_raw') as fin:
        state['recog_res_raw'] = fin.read()
    with open(output_dir + '/timestamp') as fin:
        state['timestamp'] = ast.literal_eval(fin.read())
    with open(output_dir + '/sentences') as fin:
        state['sentences'] = ast.literal_eval(fin.read())
    if os.path.exists(output_dir + '/sd_sentences'):
        with open(output_dir + '/sd_sentences') as fin:
            state['sd_sentences'] = ast.literal_eval(fin.read())
    return state
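
# Round-trip sketch (not part of this commit; assumes ./output exists and the
# state values are plain lists/dicts of str and int, which survive the
# str() -> ast.literal_eval round trip):
#
#     write_state("./output", state)
#     assert load_state("./output")["timestamp"] == state["timestamp"]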

def convert_pcm_to_float(data):
    """Convert integer PCM samples to float64 in [-1.0, 1.0)."""
    if data.dtype == np.float64:
        return data
    elif data.dtype == np.float32:
        return data.astype(np.float64)
    elif data.dtype == np.int16:
        bit_depth = 16
    elif data.dtype == np.int32:
        bit_depth = 32
    elif data.dtype == np.int8:
        bit_depth = 8
    else:
        raise ValueError("Unsupported audio data type")

    # Now handle the integer types
    max_int_value = float(2 ** (bit_depth - 1))
    if bit_depth == 8:
        # 8-bit PCM conventionally stores unsigned samples with a 128 offset;
        # cast before subtracting so the 8-bit integer range cannot overflow
        data = data.astype(np.float64) - 128
    return data.astype(np.float64) / max_int_value
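
# Quick check of the scaling (int16 full scale is 2**15 = 32768):
#     convert_pcm_to_float(np.array([0, 16384, -16384], dtype=np.int16))
#     -> array([ 0. ,  0.5, -0.5])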
\ No newline at end of file