# launch.py
import gradio as gr
from funasr import AutoModel
from videoclipper import VideoClipper
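
# Build the FunASR recognition pipeline, wrap it in VideoClipper, and expose
# recognition and clipping through a Gradio UI with a video tab and an audio tab.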
if __name__ == "__main__":
    funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                             model_revision="v2.0.4",
                             vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                             vad_model_revision="v2.0.4",
                             punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                             punc_model_revision="v2.0.4",
                             spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
                             spk_model_revision="v2.0.2",
                             )
    audio_clipper = VideoClipper(funasr_model)
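
    # Thin wrappers that adapt VideoClipper's methods to the argument order
    # Gradio passes in from the UI components defined below.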
    def audio_recog(audio_input, sd_switch, hotwords):
        return audio_clipper.recog(audio_input, sd_switch, hotwords=hotwords)

    def audio_clip(dest_text, audio_spk_input, start_ost, end_ost, state):
        return audio_clipper.clip(dest_text, start_ost, end_ost, state, dest_spk=audio_spk_input)

    def video_recog(video_input, sd_switch, hotwords):
        return audio_clipper.video_recog(video_input, sd_switch, hotwords)

    def video_clip(dest_text, video_spk_input, start_ost, end_ost, state):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, dest_spk=video_spk_input)

    def video_clip_addsub(dest_text, video_spk_input, start_ost, end_ost, state, font_size, font_color):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, font_size, font_color,
                                        add_sub=True, dest_spk=video_spk_input)
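
    # Markdown blocks rendered above the tabs: top_md_1 and top_md_2 are empty
    # layout shells here, while top_md_3 carries the usage instructions.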
    top_md_1 = ("""
    **<font color="#1785c4"></font>**
    <div align="center">
    <div style="display:flex; gap: 0.25rem;" align="center">
    </div>
    </div>
    """)
    top_md_2 = ("""
    <div align="center">
    <div style="display:flex; gap: 0.25rem;" align="center">
    </div>
    </div>
    """)
    top_md_3 = ("""
    * Step 1: Upload a video or audio file (or try the examples below), then click the **<font color="#f7802b">Recognize</font>** button
    * Step 2: Copy the text you want from the recognition result into the box at the upper right, or set the speaker labels on the right, and optionally configure the offsets and subtitle options
    * Step 3: Click the **<font color="#f7802b">Clip</font>** button or the **<font color="#f7802b">Clip and Generate Subtitles</font>** button to get the result
    """)
    # Gradio interface
    with gr.Blocks() as demo:
        # gr.Image("./examples/guide.png", show_label=False)
        gr.Markdown(top_md_1)
        gr.Markdown(top_md_2)
        gr.Markdown(top_md_3)
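        # Recognition results are cached in per-session State objects and
        # passed from the recognize callbacks to the clip callbacks.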
        video_state = gr.State()
        audio_state = gr.State()
with gr.Tab("🎥✂️视频裁剪 Video Clipping"):
with gr.Row():
with gr.Column():
video_input = gr.Video(label="🎥视频输入 Video Input")
with gr.Row():
video_sd_switch = gr.Radio(["no", "yes"], label="👥是否区分说话人 Recognize Speakers",
value='no')
hotwords_input = gr.Textbox(label="🚒热词 Hotwords")
recog_button2 = gr.Button("👂识别 Recognize")
video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
video_srt_output = gr.Textbox(label="📖SRT字幕内容 RST Subtitles")
with gr.Column():
video_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
video_spk_input = gr.Textbox(label="✏️待裁剪说话人 Speaker to Clip (多个说话人使用'#'连接)")
with gr.Row():
video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
label="⏪开始位置偏移 Start Offset (ms)")
video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50,
label="⏩结束位置偏移 End Offset (ms)")
with gr.Row():
font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2,
label="🔠字幕字体大小 Subtitle Font Size")
font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color",
value='white')
# font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
with gr.Row():
clip_button2 = gr.Button("✂️裁剪\nClip")
clip_button3 = gr.Button("✂️裁剪并添加字幕\nClip and Generate Subtitles")
video_output = gr.Video(label="🎥裁剪结果 Audio Clipped")
video_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
video_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped RST Subtitles")
with gr.Tab("🔊✂️音频裁剪 Audio Clipping"):
with gr.Row():
with gr.Column():
audio_input = gr.Audio(label="🔊音频输入 Audio Input")
with gr.Row():
audio_sd_switch = gr.Radio(["no", "yes"], label="👥是否区分说话人 Recognize Speakers",
value='no')
hotwords_input2 = gr.Textbox(label="🚒热词 Hotwords")
recog_button1 = gr.Button("👂识别 Recognize")
audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
audio_srt_output = gr.Textbox(label="📖SRT字幕内容 RST Subtitles")
with gr.Column():
audio_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
audio_spk_input = gr.Textbox(label="✏️待裁剪说话人 Speaker to Clip (多个说话人使用'#'连接)")
with gr.Row():
audio_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
label="⏪开始位置偏移 Start Offset (ms)")
audio_end_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
label="⏩结束位置偏移 End Offset (ms)")
with gr.Row():
clip_button1 = gr.Button("✂️裁剪 Clip")
audio_output = gr.Audio(label="🔊裁剪结果 Audio Clipped")
audio_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
audio_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped RST Subtitles")
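
        # Event wiring: the recognize buttons fill the text/SRT boxes and the
        # session state; the clip buttons consume that state to cut the media.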
        recog_button1.click(audio_recog,
                            inputs=[audio_input, audio_sd_switch, hotwords_input2],
                            outputs=[audio_text_output, audio_srt_output, audio_state])
        clip_button1.click(audio_clip,
                           inputs=[audio_text_input, audio_spk_input, audio_start_ost, audio_end_ost, audio_state],
                           outputs=[audio_output, audio_mess_output, audio_srt_clip_output])
        recog_button2.click(video_recog,
                            inputs=[video_input, video_sd_switch, hotwords_input],
                            outputs=[video_text_output, video_srt_output, video_state])
        clip_button2.click(video_clip,
                           inputs=[video_text_input, video_spk_input, video_start_ost, video_end_ost, video_state],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])
        clip_button3.click(video_clip_addsub,
                           inputs=[video_text_input, video_spk_input, video_start_ost, video_end_ost, video_state,
                                   font_size, font_color],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])
    # Start the Gradio service locally
    demo.launch()
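    # launch() also accepts server_name="0.0.0.0" to expose the app on the LAN
    # and server_port to pin the port; the values are deployment-specific.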