3de5c1e5 by 籽li

first

import argparse
import sys
from pathlib import Path

import yaml


class ArgumentParser(argparse.ArgumentParser):
    """Simple implementation of ArgumentParser supporting config files.

    This class originates from https://github.com/bw2/ConfigArgParse,
    but it is simplified and lacks some of that project's features:
    - No support for multiple config files
    - "--config" is always added as an option automatically
    - No formats other than YAML are supported
    - Argument types are not checked
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.add_argument("--config", help="Give config file in yaml format")

    def parse_known_args(self, args=None, namespace=None):
        # Parse once beforehand to pick up the value of "--config"
        _args, _ = super().parse_known_args(args, namespace)
        if _args.config is not None:
            if not Path(_args.config).exists():
                self.error(f"No such file: {_args.config}")

            with open(_args.config, "r", encoding="utf-8") as f:
                d = yaml.safe_load(f)
            if not isinstance(d, dict):
                self.error(f"Config file has non dict value: {_args.config}")

            # Every key in the config must match a known argument destination
            for key in d:
                for action in self._actions:
                    if key == action.dest:
                        break
                else:
                    self.error(f"unrecognized arguments: {key} (from {_args.config})")

            # NOTE(kamo): Ignore "--config" from a config file
            # NOTE(kamo): Unlike "configargparse", this module doesn't check types,
            #   i.e. we can set a value of any type regardless of the argument type.
            self.set_defaults(**d)
        return super().parse_known_args(args, namespace)
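
# A usage sketch (hypothetical option names and config file, added for
# illustration): values from a YAML config become argument defaults, and
# flags given on the command line still win, because set_defaults() only
# changes defaults before the final parse.
#
#   # conf.yaml
#   batch_size: 16
#   lr: 0.001
#
#   parser = ArgumentParser(description="demo")
#   parser.add_argument("--batch_size", type=int, default=8)
#   parser.add_argument("--lr", type=float, default=0.01)
#   args = parser.parse_args(["--config", "conf.yaml", "--lr", "0.1"])
#   print(args.batch_size, args.lr)  # -> 16 0.1 (config default, CLI override)
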
def get_commandline_args():
    extra_chars = [
        " ", ";", "&", "(", ")", "|", "^",
        "<", ">", "?", "*", "[", "]", "$",
        "`", '"', "\\", "!", "{", "}",
    ]

    # Escape the extra characters for shell
    argv = [
        arg.replace("'", "'\\''")
        if all(char not in arg for char in extra_chars)
        else "'" + arg.replace("'", "'\\''") + "'"
        for arg in sys.argv
    ]

    return sys.executable + " " + " ".join(argv)
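
# Hedged demo (the file name below is an assumption): running this module
# directly, e.g. `python config_argparse.py --tag "exp 1"`, prints a command
# line that can be pasted back into a shell, with the space-containing
# argument wrapped in single quotes.
if __name__ == "__main__":
    print(get_commandline_args())
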
from moviepy.editor import CompositeVideoClip, TextClip, VideoFileClip
from moviepy.video.tools.subtitles import SubtitlesClip

# NOTE: TextClip requires ImageMagick to be installed for text rendering.
generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=48, color='white')

subs = [((0, 2), 'sub1中文字幕'),
        ((2, 4), 'subs2'),
        ((4, 6), 'subs3'),
        ((6, 8), 'subs4')]

subtitles = SubtitlesClip(subs, generator)
video = VideoFileClip("examples/2022云栖大会_片段.mp4")
video = video.subclip(0, 8)
video = CompositeVideoClip([video, subtitles.set_pos(('center', 'bottom'))])
video.write_videofile("test_output.mp4")
import gradio as gr
from funasr import AutoModel

from videoclipper import VideoClipper

if __name__ == "__main__":
    funasr_model = AutoModel(
        model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
        model_revision="v2.0.4",
        vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
        vad_model_revision="v2.0.4",
        punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
        punc_model_revision="v2.0.4",
        spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
        spk_model_revision="v2.0.2",
    )
    audio_clipper = VideoClipper(funasr_model)

    def audio_recog(audio_input, sd_switch, hotwords):
        return audio_clipper.recog(audio_input, sd_switch, hotwords=hotwords)

    def audio_clip(dest_text, audio_spk_input, start_ost, end_ost, state):
        return audio_clipper.clip(dest_text, start_ost, end_ost, state, dest_spk=audio_spk_input)

    def video_recog(video_input, sd_switch, hotwords):
        return audio_clipper.video_recog(video_input, sd_switch, hotwords)

    def video_clip(dest_text, video_spk_input, start_ost, end_ost, state):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, dest_spk=video_spk_input)

    def video_clip_addsub(dest_text, video_spk_input, start_ost, end_ost, state, font_size, font_color):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, font_size, font_color,
                                        add_sub=True, dest_spk=video_spk_input)
    top_md_1 = ("""
**<font color="#1785c4"></font>**
<div align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
</div>
</div>
""")

    top_md_2 = ("""
<div align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
</div>
</div>
""")

    top_md_3 = ("""
* Step1: 上传视频或音频文件(或使用下方的用例体验),点击 **<font color="#f7802b">识别</font>** 按钮
* Step2: 复制识别结果中所需的文字至右上方,或者在右侧设置说话人标识,设置偏移与字幕配置(可选)
* Step3: 点击 **<font color="#f7802b">裁剪</font>** 按钮或 **<font color="#f7802b">裁剪并添加字幕</font>** 按钮获得结果
""")
    # Gradio interface
    with gr.Blocks() as demo:
        gr.Markdown(top_md_1)
        gr.Markdown(top_md_2)
        gr.Markdown(top_md_3)
        video_state = gr.State()
        audio_state = gr.State()

        with gr.Tab("🎥✂️视频裁剪 Video Clipping"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="🎥视频输入 Video Input")
                    with gr.Row():
                        video_sd_switch = gr.Radio(["no", "yes"], value='no',
                                                   label="👥是否区分说话人 Recognize Speakers")
                        hotwords_input = gr.Textbox(label="🚒热词 Hotwords")
                    recog_button2 = gr.Button("👂识别 Recognize")
                    video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    video_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    video_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
                    video_spk_input = gr.Textbox(label="✏️待裁剪说话人 Speaker to Clip (多个说话人使用'#'连接)")
                    with gr.Row():
                        video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                    label="⏪开始位置偏移 Start Offset (ms)")
                        video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50,
                                                  label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2,
                                              label="🔠字幕字体大小 Subtitle Font Size")
                        font_color = gr.Radio(["black", "white", "green", "red"], value='white',
                                              label="🌈字幕颜色 Subtitle Color")
                    with gr.Row():
                        clip_button2 = gr.Button("✂️裁剪 Clip")
                        clip_button3 = gr.Button("✂️裁剪并添加字幕 Clip and Generate Subtitles")
                    video_output = gr.Video(label="🎥裁剪结果 Video Clipped")
                    video_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    video_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        with gr.Tab("🔊✂️音频裁剪 Audio Clipping"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(label="🔊音频输入 Audio Input")
                    with gr.Row():
                        audio_sd_switch = gr.Radio(["no", "yes"], value='no',
                                                   label="👥是否区分说话人 Recognize Speakers")
                        hotwords_input2 = gr.Textbox(label="🚒热词 Hotwords")
                    recog_button1 = gr.Button("👂识别 Recognize")
                    audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    audio_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    audio_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
                    audio_spk_input = gr.Textbox(label="✏️待裁剪说话人 Speaker to Clip (多个说话人使用'#'连接)")
                    with gr.Row():
                        audio_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                    label="⏪开始位置偏移 Start Offset (ms)")
                        audio_end_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50,
                                                  label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        clip_button1 = gr.Button("✂️裁剪 Clip")
                    audio_output = gr.Audio(label="🔊裁剪结果 Audio Clipped")
                    audio_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    audio_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        recog_button1.click(audio_recog,
                            inputs=[audio_input, audio_sd_switch, hotwords_input2],
                            outputs=[audio_text_output, audio_srt_output, audio_state])
        clip_button1.click(audio_clip,
                           inputs=[audio_text_input, audio_spk_input, audio_start_ost, audio_end_ost, audio_state],
                           outputs=[audio_output, audio_mess_output, audio_srt_clip_output])
        recog_button2.click(video_recog,
                            inputs=[video_input, video_sd_switch, hotwords_input],
                            outputs=[video_text_output, video_srt_output, video_state])
        clip_button2.click(video_clip,
                           inputs=[video_text_input, video_spk_input, video_start_ost, video_end_ost, video_state],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])
        clip_button3.click(video_clip_addsub,
                           inputs=[video_text_input, video_spk_input, video_start_ost, video_end_ost, video_state,
                                   font_size, font_color],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])

    # Start the Gradio service locally
    demo.launch()

def time_convert(ms):
    """Convert a millisecond count to an SRT timestamp, "HH:MM:SS,mmm"."""
    ms = int(ms)
    tail = ms % 1000
    s = ms // 1000
    mi, s = s // 60, s % 60
    h, mi = mi // 60, mi % 60
    # Zero-pad every field; SRT requires two-digit H/M/S and three-digit ms.
    return "{:02d}:{:02d}:{:02d},{:03d}".format(h, mi, s, tail)
class Text2SRT():
    """Hold one recognized sentence and format it as an SRT cue."""

    def __init__(self, text, timestamp, offset=0):
        # One token per whitespace-separated item; CJK text arrives pre-split
        # into single characters (see pre_proc below).
        self.token_list = [i for i in text.split() if len(i)]
        self.timestamp = timestamp
        # Sentence boundaries shifted by `offset`; despite the names, these
        # attributes hold milliseconds.
        start, end = timestamp[0][0] - offset, timestamp[-1][1] - offset
        self.start_sec, self.end_sec = start, end
        self.start_time = time_convert(start)
        self.end_time = time_convert(end)

    def text(self):
        # Rejoin tokens: CJK tokens are concatenated directly, other tokens
        # get a separating space; strip any leading space this produces.
        res = ""
        for word in self.token_list:
            if '\u4e00' <= word <= '\u9fff':
                res += word
            else:
                res += " " + word
        return res.lstrip()

    def len(self):
        return len(self.token_list)

    def srt(self, acc_ost=0.0):
        # `acc_ost` is an accumulated offset in seconds, converted to ms here.
        return "{} --> {}\n{}\n".format(
            time_convert(self.start_sec + acc_ost * 1000),
            time_convert(self.end_sec + acc_ost * 1000),
            self.text())

    def time(self, acc_ost=0.0):
        # (start, end) in seconds, for moviepy-style subtitle tuples.
        return (self.start_sec / 1000 + acc_ost, self.end_sec / 1000 + acc_ost)
def generate_srt(sentence_list):
    srt_total = ''
    for i, d in enumerate(sentence_list):
        t2s = Text2SRT(d['text'], d['timestamp'])
        # SRT cue numbering starts at 1
        if 'spk' in d:
            srt_total += "{} spk{}\n{}".format(i + 1, d['spk'], t2s.srt())
        else:
            srt_total += "{}\n{}".format(i + 1, t2s.srt())
    return srt_total
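
# A small sketch with hand-made timestamps (assumed shape: one
# [start_ms, end_ms] pair per token, as produced by the recognizer):
#
#   sentences = [{'text': '大 家 好', 'timestamp': [[0, 300], [300, 600], [600, 900]]},
#                {'text': 'hello world', 'timestamp': [[1000, 1400], [1400, 1900]]}]
#   print(generate_srt(sentences))
#
# prints two numbered cues:
#
#   1
#   00:00:00,000 --> 00:00:00,900
#   大家好
#   2
#   00:00:01,000 --> 00:00:01,900
#   hello world
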
def generate_srt_clip(sentence_list, start, end, begin_index=0, time_acc_ost=0.0):
    # `start`/`end` arrive in seconds; the timestamps below are milliseconds.
    start, end = int(start * 1000), int(end * 1000)
    srt_total = ''
    cc = 1 + begin_index
    subs = []
    for d in sentence_list:
        if d['timestamp'][-1][1] <= start:
            continue
        if d['timestamp'][0][0] >= end:
            break
        # Sentences fully inside the clip window
        if (d['timestamp'][-1][1] <= end and d['timestamp'][0][0] > start) or (d['timestamp'][-1][1] == end and d['timestamp'][0][0] == start):
            t2s = Text2SRT(d['text'], d['timestamp'], offset=start)
            srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
            subs.append((t2s.time(time_acc_ost), t2s.text()))
            cc += 1
            continue
        # Sentence starts before the window: drop the leading tokens
        if d['timestamp'][0][0] <= start:
            if not d['timestamp'][-1][1] > end:
                for j, ts in enumerate(d['timestamp']):
                    if ts[1] > start:
                        break
                _text = " ".join(d['text'].split()[j:])
                _ts = d['timestamp'][j:]
            else:
                # Sentence spans the whole window: keep only the middle tokens
                for j, ts in enumerate(d['timestamp']):
                    if ts[1] > start:
                        _start = j
                        break
                for j, ts in enumerate(d['timestamp']):
                    if ts[1] > end:
                        _end = j
                        break
                _text = " ".join(d['text'].split()[_start:_end])
                _ts = d['timestamp'][_start:_end]
            if len(_ts):
                t2s = Text2SRT(_text, _ts, offset=start)
                srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
                subs.append((t2s.time(time_acc_ost), t2s.text()))
                cc += 1
            continue
        # Sentence ends after the window: drop the trailing tokens
        if d['timestamp'][-1][1] > end:
            for j, ts in enumerate(d['timestamp']):
                if ts[1] > end:
                    break
            _text = " ".join(d['text'].split()[:j])
            _ts = d['timestamp'][:j]
            if len(_ts):
                t2s = Text2SRT(_text, _ts, offset=start)
                srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
                subs.append((t2s.time(time_acc_ost), t2s.text()))
                cc += 1
            continue
    return srt_total, subs, cc
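
# Sketch of the boundary handling with hand-made data: clipping the window
# [0.1 s, 0.85 s] from a sentence whose per-token timestamps are
# [[200, 300], [300, 600], [600, 900]] (ms) drops the last token, since it
# ends after the 850 ms cut; the kept times are re-based to the clip start.
#
#   sentences = [{'text': '大 家 好', 'timestamp': [[200, 300], [300, 600], [600, 900]]}]
#   srt, subs, cc = generate_srt_clip(sentences, 0.1, 0.85)
#   # srt  -> "1\n00:00:00,100 --> 00:00:00,500\n大家\n"
#   # subs -> [((0.1, 0.5), '大家')]
#   # cc   -> 2
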
# Step 1: Recognize
python videoclipper.py --stage 1 \
                       --file ../examples/2022云栖大会_片段.mp4 \
                       --sd_switch yes \
                       --output_dir ./output
# The recognition results and the full SRT file are now in ./output/

# Step 2: Clip (add --dest_spk spk0 to clip by speaker instead of text)
python videoclipper.py --stage 2 \
                       --file ../examples/2022云栖大会_片段.mp4 \
                       --output_dir ./output \
                       --dest_text '所以这个是我们办这个奖的初心啊,我们也会一届一届的办下去' \
                       --start_ost 0 \
                       --end_ost 100 \
                       --output_file './output/res.mp4'
import ast
import os

import numpy as np

# Punctuation marks removed by pre_proc()
PUNC_LIST = [',', '。', '!', '?', '、']


def pre_proc(text):
    # Strip punctuation and put a space around every CJK character so that
    # the text can be tokenized by simple whitespace splitting.
    res = ''
    for i in range(len(text)):
        if text[i] in PUNC_LIST:
            continue
        if '\u4e00' <= text[i] <= '\u9fff':
            if len(res) and res[-1] != " ":
                res += ' ' + text[i] + ' '
            else:
                res += text[i] + ' '
        else:
            res += text[i]
    # Guard against empty input before trimming the trailing space
    if len(res) and res[-1] == ' ':
        res = res[:-1]
    return res
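
# Worked example: punctuation is dropped and CJK characters are spaced out
# for whitespace tokenization, while Latin words stay intact:
#   pre_proc('大家好,hello世界。')  ->  '大 家 好 hello 世 界'
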
def proc(raw_text, timestamp, dest_text):
    # Simple substring matching: find every occurrence of `dest_text` in the
    # pre-processed transcript and map it back to a time range.
    ld = len(dest_text.split())
    mi, ts = [], []
    offset = 0
    while True:
        fi = raw_text.find(dest_text, offset, len(raw_text))
        if fi == -1:
            break
        # The token index equals the number of spaces before the match
        ti = raw_text[:fi].count(' ')
        # Advance past this occurrence (in characters, not tokens)
        offset = fi + len(dest_text)
        mi.append(fi)
        # Timestamps are in milliseconds; *16 converts them to sample
        # indices at the assumed 16 kHz sampling rate.
        ts.append([timestamp[ti][0] * 16, timestamp[ti + ld - 1][1] * 16])
    return ts
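
# Matching sketch with hand-made data: the transcript has one token per
# timestamp pair, so a hit preceded by two spaces starts at token index 2.
#   raw_text  = '大 家 好 hello'
#   timestamp = [[0, 300], [300, 600], [600, 900], [1000, 1400]]
#   proc(raw_text, timestamp, '好 hello')
#   -> [[9600, 22400]]   # 600 ms * 16 and 1400 ms * 16: sample indices at 16 kHz
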
def proc_spk(dest_spk, sd_sentences):
    # Collect the time ranges of every sentence spoken by `dest_spk`
    # (expected form "spk<N>", so the numeric id starts at index 3).
    ts = []
    spkid = dest_spk[3:]
    for d in sd_sentences:
        d_start = d['timestamp'][0][0]
        d_end = d['timestamp'][-1][1]
        # Keep only sentences longer than roughly one second
        if str(d['spk']) == spkid and d_end - d_start > 999:
            ts.append([d['start'] * 16, d['end'] * 16])
    return ts
def generate_vad_data(data, sd_sentences, sr=16000):
    assert len(data.shape) == 1
    # Cut the mono waveform into per-sentence segments using the first and
    # last timestamps (ms) of each sentence.
    vad_data = []
    for d in sd_sentences:
        d_start = round(d['ts_list'][0][0] / 1000, 2)
        d_end = round(d['ts_list'][-1][1] / 1000, 2)
        vad_data.append([d_start, d_end, data[int(d_start * sr):int(d_end * sr)]])
    return vad_data
def write_state(output_dir, state):
    # Dump the recognition state as plain-text files, one per key
    for key in ['/recog_res_raw', '/timestamp', '/sentences']:
        with open(output_dir + key, 'w') as fout:
            fout.write(str(state[key[1:]]))
    if 'sd_sentences' in state:
        with open(output_dir + '/sd_sentences', 'w') as fout:
            fout.write(str(state['sd_sentences']))
def load_state(output_dir):
    # Counterpart of write_state(): the structured fields were written with
    # str(), so parse them back with ast.literal_eval (safer than eval()).
    state = {}
    with open(output_dir + '/recog_res_raw') as fin:
        state['recog_res_raw'] = fin.read()
    with open(output_dir + '/timestamp') as fin:
        state['timestamp'] = ast.literal_eval(fin.read())
    with open(output_dir + '/sentences') as fin:
        state['sentences'] = ast.literal_eval(fin.read())
    if os.path.exists(output_dir + '/sd_sentences'):
        with open(output_dir + '/sd_sentences') as fin:
            state['sd_sentences'] = ast.literal_eval(fin.read())
    return state
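
# Round-trip sketch (assumes the `./output` directory exists): everything
# written by write_state() comes back with the same structure.
#   state = {'recog_res_raw': '大 家 好', 'timestamp': [[0, 300]],
#            'sentences': [{'text': '大 家 好', 'timestamp': [[0, 300]]}]}
#   write_state('./output', state)
#   assert load_state('./output')['timestamp'] == [[0, 300]]
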
def convert_pcm_to_float(data):
    # Normalize integer PCM audio to float64 in [-1.0, 1.0); float input is
    # only cast, not rescaled.
    if data.dtype == np.float64:
        return data
    elif data.dtype == np.float32:
        return data.astype(np.float64)
    elif data.dtype == np.int16:
        bit_depth = 16
    elif data.dtype == np.int32:
        bit_depth = 32
    elif data.dtype == np.int8:
        bit_depth = 8
    else:
        raise ValueError("Unsupported audio data type")

    # Scale by the magnitude of the most negative representable value
    max_int_value = float(2 ** (bit_depth - 1))
    if bit_depth == 8:
        # NOTE: this assumes 8-bit samples are stored as unsigned values
        # (0..255) held in an integer container, hence the 128 shift
        data = data - 128
    return data.astype(np.float64) / max_int_value
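
# Quick check with int16 input: the full-scale negative value maps to -1.0.
#   convert_pcm_to_float(np.array([-32768, 0, 16384], dtype=np.int16))
#   -> array([-1. ,  0. ,  0.5])
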