# launch.py 8.29 KB
import gradio as gr
from funasr import AutoModel
from videoclipper import VideoClipper

if __name__ == "__main__":
    # Keyword arguments for AutoModel: the main ASR model plus the VAD,
    # punctuation and speaker models, each pinned to an explicit revision.
    asr_pipeline_kwargs = {
        "model": "iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
        "model_revision": "v2.0.4",
        "vad_model": "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
        "vad_model_revision": "v2.0.4",
        "punc_model": "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
        "punc_model_revision": "v2.0.4",
        "spk_model": "damo/speech_campplus_sv_zh-cn_16k-common",
        "spk_model_revision": "v2.0.2",
    }
    funasr_model = AutoModel(**asr_pipeline_kwargs)

    # A single clipper instance (wrapping the model above) backs both the
    # audio and the video callbacks defined below.
    audio_clipper = VideoClipper(funasr_model)


    def audio_recog(audio_input, sd_switch, hotwords):
        """Click handler for the audio "Recognize" button.

        Prints the raw ``audio_input`` to stdout, then forwards all three
        arguments to ``audio_clipper.recog`` (``hotwords`` by keyword) and
        returns that call's result unchanged.
        """
        print(audio_input)
        recog_result = audio_clipper.recog(audio_input, sd_switch, hotwords=hotwords)
        return recog_result


    def audio_clip(dest_text, audio_spk_input, start_ost, end_ost, state):
        """Click handler for the audio "Clip" button.

        Reorders the UI inputs into the argument order used by
        ``audio_clipper.clip``: the speaker text is passed as the ``dest_spk``
        keyword, everything else positionally. Returns the clipper's result
        unchanged.
        """
        clip_result = audio_clipper.clip(
            dest_text,
            start_ost,
            end_ost,
            state,
            dest_spk=audio_spk_input,
        )
        return clip_result


    def video_recog(video_input, sd_switch, hotwords):
        """Click handler for the video "Recognize" button.

        Passes the three inputs positionally to ``audio_clipper.video_recog``
        and returns its result unchanged.
        """
        recog_result = audio_clipper.video_recog(video_input, sd_switch, hotwords)
        return recog_result


    def video_clip(dest_text, video_spk_input, start_ost, end_ost, state):
        """Click handler for the video "Clip" button (no subtitles requested).

        Forwards to ``audio_clipper.video_clip`` with the speaker text passed
        as the ``dest_spk`` keyword, and returns its result unchanged.
        """
        clip_result = audio_clipper.video_clip(
            dest_text,
            start_ost,
            end_ost,
            state,
            dest_spk=video_spk_input,
        )
        return clip_result


    def video_clip_addsub(dest_text, video_spk_input, start_ost, end_ost, state, font_size, font_color):
        """Click handler for the "Clip and Generate Subtitles" button.

        Same call as the plain video clip handler, but additionally passes
        ``font_size`` and ``font_color`` positionally and sets ``add_sub=True``
        on ``audio_clipper.video_clip``. Returns the clipper's result unchanged.
        """
        clip_result = audio_clipper.video_clip(
            dest_text,
            start_ost,
            end_ost,
            state,
            font_size,
            font_color,
            add_sub=True,
            dest_spk=video_spk_input,
        )
        return clip_result


    # Markdown/HTML snippets rendered at the top of the page, in this order.

    # First banner: an empty colored <font> placeholder above an empty flex row.
    top_md_1 = """
     **<font color="#1785c4"></font>** 
    <div align="center">
    <div style="display:flex; gap: 0.25rem;" align="center"> 
    </div>
    </div>
    """

    # Second banner: another empty centered flex row.
    top_md_2 = """
    <div align="center">
    <div style="display:flex; gap: 0.25rem;" align="center">
    </div>
    </div>
    """

    # Three-step usage guide shown to the user (recognize -> pick text/speaker -> clip).
    top_md_3 = """
    * Step1: 上传视频或音频文件(或使用下方的用例体验),点击 **<font color="#f7802b">识别</font>** 按钮
    * Step2: 复制识别结果中所需的文字至右上方,或者右设置说话人标识,设置偏移与字幕配置(可选)
    * Step3: 点击 **<font color="#f7802b">裁剪</font>** 按钮或 **<font color="#f7802b">裁剪并添加字幕</font>** 按钮获得结果
    """

    # Gradio interface: a video tab and an audio tab with the same layout.
    # Left column: media input, recognition options and recognition results.
    # Right column: text/speaker to clip, offsets, clip buttons and clip results.
    with gr.Blocks() as demo:
        gr.Markdown(top_md_1)
        gr.Markdown(top_md_2)
        gr.Markdown(top_md_3)
        # Per-tab state: written by the recognize callbacks and fed back into
        # the clip callbacks (see the .click wiring below).
        video_state = gr.State()
        audio_state = gr.State()
        with gr.Tab("🎥✂️视频裁剪 Video Clipping"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="🎥视频输入 Video Input")
                    with gr.Row():
                        video_sd_switch = gr.Radio(["no", "yes"], label="👥是否区分说话人 Recognize Speakers",
                                                   value='no')
                        hotwords_input = gr.Textbox(label="🚒热词 Hotwords")
                    recog_button2 = gr.Button("👂识别 Recognize")
                    video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    # Label fixed: the subtitle format is SRT (was misspelled "RST").
                    video_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    video_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
                    video_spk_input = gr.Textbox(label="✏️待裁剪说话人 Speaker to Clip (多个说话人使用'#'连接)")
                    with gr.Row():
                        video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                    label="⏪开始位置偏移 Start Offset (ms)")
                        video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50,
                                                  label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2,
                                              label="🔠字幕字体大小 Subtitle Font Size")
                        font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color",
                                              value='white')
                    with gr.Row():
                        clip_button2 = gr.Button("✂️裁剪\nClip")
                        clip_button3 = gr.Button("✂️裁剪并添加字幕\nClip and Generate Subtitles")
                    # Label fixed: this component shows the clipped *video*
                    # (was "Audio Clipped", copied from the audio tab).
                    video_output = gr.Video(label="🎥裁剪结果 Video Clipped")
                    video_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    video_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        with gr.Tab("🔊✂️音频裁剪 Audio Clipping"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(label="🔊音频输入 Audio Input")
                    with gr.Row():
                        audio_sd_switch = gr.Radio(["no", "yes"], label="👥是否区分说话人 Recognize Speakers",
                                                   value='no')
                        hotwords_input2 = gr.Textbox(label="🚒热词 Hotwords")
                    recog_button1 = gr.Button("👂识别 Recognize")
                    audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    audio_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    audio_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
                    audio_spk_input = gr.Textbox(label="✏️待裁剪说话人 Speaker to Clip (多个说话人使用'#'连接)")
                    with gr.Row():
                        audio_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                    label="⏪开始位置偏移 Start Offset (ms)")
                        audio_end_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                  label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        clip_button1 = gr.Button("✂️裁剪 Clip")
                    audio_output = gr.Audio(label="🔊裁剪结果 Audio Clipped")
                    audio_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    audio_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        # Audio tab wiring: recognize fills text/SRT/state; clip consumes the state.
        recog_button1.click(audio_recog,
                            inputs=[audio_input, audio_sd_switch, hotwords_input2],
                            outputs=[audio_text_output, audio_srt_output, audio_state])
        clip_button1.click(audio_clip,
                           inputs=[audio_text_input, audio_spk_input, audio_start_ost, audio_end_ost, audio_state],
                           outputs=[audio_output, audio_mess_output, audio_srt_clip_output])

        # Video tab wiring: both clip buttons write to the same three outputs;
        # the subtitle variant additionally receives font size and color.
        recog_button2.click(video_recog,
                            inputs=[video_input, video_sd_switch, hotwords_input],
                            outputs=[video_text_output, video_srt_output, video_state])
        clip_button2.click(video_clip,
                           inputs=[video_text_input, video_spk_input, video_start_ost, video_end_ost, video_state],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])
        clip_button3.click(video_clip_addsub,
                           inputs=[video_text_input, video_spk_input, video_start_ost, video_end_ost, video_state,
                                   font_size, font_color],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])

    # Start the Gradio service locally (default launch settings).
    demo.launch()