Compare commits
No commits in common. "0416a9017cfbbfff8a308cafc6afe872a5a7413a" and "380bf27beacb11ac87bc6ed91aea895f7a7c9d73" have entirely different histories.
0416a9017c
...
380bf27bea
33
.gitattributes
vendored
33
.gitattributes
vendored
@ -1,52 +1,22 @@
|
|||||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
<<<<<<< HEAD
|
|
||||||
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
|
||||||
=======
|
|
||||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
>>>>>>> 380bf27beacb11ac87bc6ed91aea895f7a7c9d73
|
|
||||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
<<<<<<< HEAD
|
|
||||||
*.model filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
|
||||||
=======
|
|
||||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
*.model filter=lfs diff=lfs merge=lfs -text
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
>>>>>>> 380bf27beacb11ac87bc6ed91aea895f7a7c9d73
|
|
||||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
<<<<<<< HEAD
|
|
||||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
|
||||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.db* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.ark* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
|
|
||||||
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
|
||||||
=======
|
|
||||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
@ -63,5 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|||||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
*.EncryptBy4pd filter=lfs diff=lfs merge=lfs -text
|
*.EncryptBy4pd filter=lfs diff=lfs merge=lfs -text
|
||||||
>>>>>>> 380bf27beacb11ac87bc6ed91aea895f7a7c9d73
|
|
||||||
318
README.md
318
README.md
@ -1,318 +0,0 @@
|
|||||||
---
|
|
||||||
tasks:
|
|
||||||
- auto-speech-recognition
|
|
||||||
domain:
|
|
||||||
- audio
|
|
||||||
model-type:
|
|
||||||
- autoregressive
|
|
||||||
frameworks:
|
|
||||||
- pytorch
|
|
||||||
backbone:
|
|
||||||
- transformer/conformer
|
|
||||||
metrics:
|
|
||||||
- CER
|
|
||||||
license: Apache License 2.0
|
|
||||||
language:
|
|
||||||
- multilingual
|
|
||||||
tags:
|
|
||||||
- FunASR
|
|
||||||
- Whisper
|
|
||||||
datasets:
|
|
||||||
train:
|
|
||||||
- 680,000 hour
|
|
||||||
test:
|
|
||||||
- test
|
|
||||||
indexing:
|
|
||||||
results:
|
|
||||||
- task:
|
|
||||||
name: Automatic Speech Recognition
|
|
||||||
dataset:
|
|
||||||
name: 680,000 hour
|
|
||||||
metrics:
|
|
||||||
- type: CER
|
|
||||||
value: 8.53% # float
|
|
||||||
description: greedy search, without lm, avg.
|
|
||||||
args: default
|
|
||||||
- type: RTF
|
|
||||||
value: 0.0251 # float
|
|
||||||
description: GPU inference on V100
|
|
||||||
args: batch_size=1
|
|
||||||
widgets:
|
|
||||||
- task: auto-speech-recognition
|
|
||||||
model_revision: v2.0.5
|
|
||||||
inputs:
|
|
||||||
- type: audio
|
|
||||||
name: input
|
|
||||||
title: 音频
|
|
||||||
examples:
|
|
||||||
- name: 1
|
|
||||||
title: 示例1
|
|
||||||
inputs:
|
|
||||||
- name: input
|
|
||||||
data: git://example/asr_example.wav
|
|
||||||
inferencespec:
|
|
||||||
cpu: 8 #CPU数量
|
|
||||||
memory: 4096
|
|
||||||
---
|
|
||||||
|
|
||||||
|
|
||||||
# Whisper模型介绍
|
|
||||||
|
|
||||||
## <strong>[ModelScope-FunASR](https://github.com/alibaba-damo-academy/FunASR)</strong>
|
|
||||||
<strong>[FunASR](https://github.com/alibaba-damo-academy/FunASR)</strong>希望在语音识别方面建立学术研究和工业应用之间的桥梁。通过支持在ModelScope上发布的工业级语音识别模型的训练和微调,研究人员和开发人员可以更方便地进行语音识别模型的研究和生产,并促进语音识别生态系统的发展。
|
|
||||||
|
|
||||||
[**最新动态**](https://github.com/alibaba-damo-academy/FunASR#whats-new)
|
|
||||||
| [**环境安装**](https://github.com/alibaba-damo-academy/FunASR#installation)
|
|
||||||
| [**介绍文档**](https://alibaba-damo-academy.github.io/FunASR/en/index.html)
|
|
||||||
| [**中文教程**](https://github.com/alibaba-damo-academy/FunASR/wiki#funasr%E7%94%A8%E6%88%B7%E6%89%8B%E5%86%8C)
|
|
||||||
| [**服务部署**](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime)
|
|
||||||
| [**模型库**](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md)
|
|
||||||
| [**联系我们**](https://github.com/alibaba-damo-academy/FunASR#contact)
|
|
||||||
|
|
||||||
|
|
||||||
## 基于ModelScope进行推理
|
|
||||||
|
|
||||||
- 推理支持音频格式如下:
|
|
||||||
- wav文件路径,例如:data/test/audios/asr_example.wav
|
|
||||||
- pcm文件路径,例如:data/test/audios/asr_example.pcm
|
|
||||||
- wav文件url,例如:https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav
|
|
||||||
- wav二进制数据,格式bytes,例如:用户直接从文件里读出bytes数据或者是麦克风录出bytes数据。
|
|
||||||
- 已解析的audio音频,例如:audio, rate = soundfile.read("asr_example_zh.wav"),类型为numpy.ndarray或者torch.Tensor。
|
|
||||||
- wav.scp文件,需符合如下要求:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
cat wav.scp
|
|
||||||
asr_example1 data/test/audios/asr_example1.wav
|
|
||||||
asr_example2 data/test/audios/asr_example2.wav
|
|
||||||
...
|
|
||||||
```
|
|
||||||
|
|
||||||
- 若输入格式wav文件url,api调用方式可参考如下范例:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from modelscope.pipelines import pipeline
|
|
||||||
from modelscope.utils.constant import Tasks
|
|
||||||
|
|
||||||
inference_pipeline = pipeline(
|
|
||||||
task=Tasks.auto_speech_recognition,
|
|
||||||
model='iic/Whisper-large-v3', model_revision="v2.0.5")
|
|
||||||
|
|
||||||
rec_result = inference_pipeline(input='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav', language=None)
|
|
||||||
print(rec_result)
|
|
||||||
```
|
|
||||||
|
|
||||||
- 输入音频为pcm格式,调用api时需要传入音频采样率参数fs,例如:
|
|
||||||
|
|
||||||
```python
|
|
||||||
rec_result = inference_pipeline(input='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.pcm', fs=16000)
|
|
||||||
```
|
|
||||||
|
|
||||||
- 输入音频为wav格式,api调用方式可参考如下范例:
|
|
||||||
|
|
||||||
```python
|
|
||||||
rec_result = inference_pipeline(input='asr_example_zh.wav')
|
|
||||||
```
|
|
||||||
|
|
||||||
- 若输入格式为文件wav.scp(注:文件名需要以.scp结尾),可添加 output_dir 参数将识别结果写入文件中,api调用方式可参考如下范例:
|
|
||||||
|
|
||||||
```python
|
|
||||||
inference_pipeline(input="wav.scp", output_dir='./output_dir')
|
|
||||||
```
|
|
||||||
识别结果输出路径结构如下:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
tree output_dir/
|
|
||||||
output_dir/
|
|
||||||
└── 1best_recog
|
|
||||||
├── score
|
|
||||||
└── text
|
|
||||||
|
|
||||||
1 directory, 3 files
|
|
||||||
```
|
|
||||||
score:识别路径得分
|
|
||||||
|
|
||||||
text:语音识别结果文件
|
|
||||||
|
|
||||||
|
|
||||||
- 若输入音频为已解析的audio音频,api调用方式可参考如下范例:
|
|
||||||
|
|
||||||
```python
|
|
||||||
import soundfile
|
|
||||||
|
|
||||||
waveform, sample_rate = soundfile.read("asr_example_zh.wav")
|
|
||||||
rec_result = inference_pipeline(input=waveform)
|
|
||||||
```
|
|
||||||
|
|
||||||
- ASR、VAD、PUNC模型自由组合
|
|
||||||
|
|
||||||
可根据使用需求对VAD和PUNC标点模型进行自由组合,使用方式如下:
|
|
||||||
```python
|
|
||||||
inference_pipeline = pipeline(
|
|
||||||
task=Tasks.auto_speech_recognition,
|
|
||||||
model='iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch', model_revision="v2.0.4",
|
|
||||||
vad_model='iic/speech_fsmn_vad_zh-cn-16k-common-pytorch', vad_model_revision="v2.0.4",
|
|
||||||
punc_model='iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch', punc_model_revision="v2.0.4",
|
|
||||||
# spk_model="iic/speech_campplus_sv_zh-cn_16k-common",
|
|
||||||
# spk_model_revision="v2.0.2",
|
|
||||||
)
|
|
||||||
```
|
|
||||||
若不使用PUNC模型,可配置punc_model="",或不传入punc_model参数,如需加入LM模型,可增加配置lm_model='damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch',并设置lm_weight和beam_size参数。
|
|
||||||
|
|
||||||
## 基于FunASR进行推理
|
|
||||||
|
|
||||||
下面为快速上手教程,测试音频([中文](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav),[英文](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav))
|
|
||||||
|
|
||||||
### 可执行命令行
|
|
||||||
在命令行终端执行:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=vad_example.wav
|
|
||||||
```
|
|
||||||
|
|
||||||
注:支持单条音频文件识别,也支持文件列表,列表为kaldi风格wav.scp:`wav_id wav_path`
|
|
||||||
|
|
||||||
### python示例
|
|
||||||
#### 非实时语音识别
|
|
||||||
```python
|
|
||||||
from funasr import AutoModel
|
|
||||||
# paraformer-zh is a multi-functional asr model
|
|
||||||
# use vad, punc, spk or not as you need
|
|
||||||
model = AutoModel(model="paraformer-zh", model_revision="v2.0.4",
|
|
||||||
vad_model="fsmn-vad", vad_model_revision="v2.0.4",
|
|
||||||
punc_model="ct-punc-c", punc_model_revision="v2.0.4",
|
|
||||||
# spk_model="cam++", spk_model_revision="v2.0.2",
|
|
||||||
)
|
|
||||||
res = model.generate(input=f"{model.model_path}/example/asr_example.wav",
|
|
||||||
batch_size_s=300,
|
|
||||||
hotword='魔搭')
|
|
||||||
print(res)
|
|
||||||
```
|
|
||||||
注:`model_hub`:表示模型仓库,`ms`为选择modelscope下载,`hf`为选择huggingface下载。
|
|
||||||
|
|
||||||
#### 实时语音识别
|
|
||||||
|
|
||||||
```python
|
|
||||||
from funasr import AutoModel
|
|
||||||
|
|
||||||
chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
|
|
||||||
encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
|
|
||||||
decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
|
|
||||||
|
|
||||||
model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.4")
|
|
||||||
|
|
||||||
import soundfile
|
|
||||||
import os
|
|
||||||
|
|
||||||
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
|
|
||||||
speech, sample_rate = soundfile.read(wav_file)
|
|
||||||
chunk_stride = chunk_size[1] * 960 # 600ms
|
|
||||||
|
|
||||||
cache = {}
|
|
||||||
total_chunk_num = int((len(speech)-1)/chunk_stride+1)
|
|
||||||
for i in range(total_chunk_num):
|
|
||||||
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
|
|
||||||
is_final = i == total_chunk_num - 1
|
|
||||||
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
|
|
||||||
print(res)
|
|
||||||
```
|
|
||||||
|
|
||||||
注:`chunk_size`为流式延时配置,`[0,10,5]`表示上屏实时出字粒度为`10*60=600ms`,未来信息为`5*60=300ms`。每次推理输入为`600ms`(采样点数为`16000*0.6=9600`),输出为对应文字,最后一个语音片段输入需要设置`is_final=True`来强制输出最后一个字。
|
|
||||||
|
|
||||||
#### 语音端点检测(非实时)
|
|
||||||
```python
|
|
||||||
from funasr import AutoModel
|
|
||||||
|
|
||||||
model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
|
|
||||||
|
|
||||||
wav_file = f"{model.model_path}/example/asr_example.wav"
|
|
||||||
res = model.generate(input=wav_file)
|
|
||||||
print(res)
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 语音端点检测(实时)
|
|
||||||
```python
|
|
||||||
from funasr import AutoModel
|
|
||||||
|
|
||||||
chunk_size = 200 # ms
|
|
||||||
model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
|
|
||||||
|
|
||||||
import soundfile
|
|
||||||
|
|
||||||
wav_file = f"{model.model_path}/example/vad_example.wav"
|
|
||||||
speech, sample_rate = soundfile.read(wav_file)
|
|
||||||
chunk_stride = int(chunk_size * sample_rate / 1000)
|
|
||||||
|
|
||||||
cache = {}
|
|
||||||
total_chunk_num = int((len(speech)-1)/chunk_stride+1)
|
|
||||||
for i in range(total_chunk_num):
|
|
||||||
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
|
|
||||||
is_final = i == total_chunk_num - 1
|
|
||||||
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
|
|
||||||
if len(res[0]["value"]):
|
|
||||||
print(res)
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 标点恢复
|
|
||||||
```python
|
|
||||||
from funasr import AutoModel
|
|
||||||
|
|
||||||
model = AutoModel(model="ct-punc", model_revision="v2.0.4")
|
|
||||||
|
|
||||||
res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
|
|
||||||
print(res)
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 时间戳预测
|
|
||||||
```python
|
|
||||||
from funasr import AutoModel
|
|
||||||
|
|
||||||
model = AutoModel(model="fa-zh", model_revision="v2.0.4")
|
|
||||||
|
|
||||||
wav_file = f"{model.model_path}/example/asr_example.wav"
|
|
||||||
text_file = f"{model.model_path}/example/text.txt"
|
|
||||||
res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
|
|
||||||
print(res)
|
|
||||||
```
|
|
||||||
|
|
||||||
更多详细用法([示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining))
|
|
||||||
|
|
||||||
|
|
||||||
## 微调
|
|
||||||
|
|
||||||
详细用法([示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining))
|
|
||||||
|
|
||||||
## 使用方式以及适用范围
|
|
||||||
|
|
||||||
运行范围
|
|
||||||
- 支持Linux-x86_64、Mac和Windows运行。
|
|
||||||
|
|
||||||
使用方式
|
|
||||||
- 直接推理:可以直接对输入音频进行解码,输出目标文字。
|
|
||||||
|
|
||||||
使用范围与目标场景
|
|
||||||
- 适合于离线语音识别场景
|
|
||||||
|
|
||||||
|
|
||||||
## 模型局限性以及可能的偏差
|
|
||||||
|
|
||||||
考虑到特征提取流程和工具以及训练工具差异,会对CER的数据带来一定的差异(<0.1%),推理GPU环境差异导致的RTF数值差异。
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## 相关论文以及引用信息
|
|
||||||
|
|
||||||
```BibTeX
|
|
||||||
@inproceedings{radford2023robust,
|
|
||||||
title={Robust speech recognition via large-scale weak supervision},
|
|
||||||
author={Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
|
|
||||||
booktitle={International Conference on Machine Learning},
|
|
||||||
pages={28492--28518},
|
|
||||||
year={2023},
|
|
||||||
organization={PMLR}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
38
config.yaml
38
config.yaml
@ -1,38 +0,0 @@
|
|||||||
# network architecture
|
|
||||||
model: WhisperWarp
|
|
||||||
model_conf:
|
|
||||||
lsm_weight: 0.1
|
|
||||||
length_normalized_loss: true
|
|
||||||
hub: funasr # openai
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# only use for hub == funasr,
|
|
||||||
# if hub == openai, dims is automatically downloaded
|
|
||||||
dims:
|
|
||||||
n_mels: 128
|
|
||||||
n_vocab: 51866
|
|
||||||
n_audio_ctx: 1500
|
|
||||||
n_audio_state: 1280
|
|
||||||
n_audio_head: 20
|
|
||||||
n_audio_layer: 32
|
|
||||||
n_text_ctx: 448
|
|
||||||
n_text_state: 1280
|
|
||||||
n_text_head: 20
|
|
||||||
n_text_layer: 32
|
|
||||||
|
|
||||||
# frontend related
|
|
||||||
frontend: WhisperFrontend
|
|
||||||
frontend_conf:
|
|
||||||
fs: 16000
|
|
||||||
n_mels: ${dims.n_mels}
|
|
||||||
do_pad_trim: true
|
|
||||||
|
|
||||||
tokenizer: WhisperTokenizer
|
|
||||||
tokenizer_conf:
|
|
||||||
language: null
|
|
||||||
task: transcribe
|
|
||||||
is_multilingual: true
|
|
||||||
num_languages: 100
|
|
||||||
|
|
||||||
scope_map: [none, "model."]
|
|
||||||
@ -1,12 +0,0 @@
|
|||||||
{
|
|
||||||
"framework": "pytorch",
|
|
||||||
"task" : "auto-speech-recognition",
|
|
||||||
"model": {"type" : "funasr"},
|
|
||||||
"pipeline": {"type":"funasr-pipeline"},
|
|
||||||
"model_name_in_hub": {
|
|
||||||
"ms":"",
|
|
||||||
"hf":""},
|
|
||||||
"file_path_metas": {
|
|
||||||
"init_param":"large-v3.pt",
|
|
||||||
"config":"config.yaml"}
|
|
||||||
}
|
|
||||||
Binary file not shown.
BIN
large-v3.pt
(Stored with Git LFS)
BIN
large-v3.pt
(Stored with Git LFS)
Binary file not shown.
@ -1 +0,0 @@
|
|||||||
openai-whisper
|
|
||||||
Loading…
Reference in New Issue
Block a user