Compare commits
10 Commits
380bf27bea
...
0416a9017c
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0416a9017c | ||
|
|
29757107a1 | ||
|
|
53b03d051c | ||
|
|
781dd70f93 | ||
|
|
53611869c3 | ||
|
|
5c11e5ed76 | ||
|
|
2fdda1eeea | ||
|
|
987d045d09 | ||
|
|
ff160a608a | ||
|
|
f2768f633f |
33
.gitattributes
vendored
33
.gitattributes
vendored
@ -1,22 +1,52 @@
|
|||||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
<<<<<<< HEAD
|
||||||
|
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
=======
|
||||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
>>>>>>> 380bf27beacb11ac87bc6ed91aea895f7a7c9d73
|
||||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
<<<<<<< HEAD
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
=======
|
||||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
*.model filter=lfs diff=lfs merge=lfs -text
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
>>>>>>> 380bf27beacb11ac87bc6ed91aea895f7a7c9d73
|
||||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
<<<<<<< HEAD
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.db* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ark* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
|
||||||
|
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
=======
|
||||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
@ -33,4 +63,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|||||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
*.EncryptBy4pd filter=lfs diff=lfs merge=lfs -text
|
*.EncryptBy4pd filter=lfs diff=lfs merge=lfs -text
|
||||||
|
>>>>>>> 380bf27beacb11ac87bc6ed91aea895f7a7c9d73
|
||||||
|
|||||||
318
README.md
Normal file
318
README.md
Normal file
@ -0,0 +1,318 @@
|
|||||||
|
---
|
||||||
|
tasks:
|
||||||
|
- auto-speech-recognition
|
||||||
|
domain:
|
||||||
|
- audio
|
||||||
|
model-type:
|
||||||
|
- autoregressive
|
||||||
|
frameworks:
|
||||||
|
- pytorch
|
||||||
|
backbone:
|
||||||
|
- transformer/conformer
|
||||||
|
metrics:
|
||||||
|
- CER
|
||||||
|
license: Apache License 2.0
|
||||||
|
language:
|
||||||
|
- multilingual
|
||||||
|
tags:
|
||||||
|
- FunASR
|
||||||
|
- Whisper
|
||||||
|
datasets:
|
||||||
|
train:
|
||||||
|
- 680,000 hour
|
||||||
|
test:
|
||||||
|
- test
|
||||||
|
indexing:
|
||||||
|
results:
|
||||||
|
- task:
|
||||||
|
name: Automatic Speech Recognition
|
||||||
|
dataset:
|
||||||
|
name: 680,000 hour
|
||||||
|
metrics:
|
||||||
|
- type: CER
|
||||||
|
value: 8.53% # float
|
||||||
|
description: greedy search, without lm, avg.
|
||||||
|
args: default
|
||||||
|
- type: RTF
|
||||||
|
value: 0.0251 # float
|
||||||
|
description: GPU inference on V100
|
||||||
|
args: batch_size=1
|
||||||
|
widgets:
|
||||||
|
- task: auto-speech-recognition
|
||||||
|
model_revision: v2.0.5
|
||||||
|
inputs:
|
||||||
|
- type: audio
|
||||||
|
name: input
|
||||||
|
title: 音频
|
||||||
|
examples:
|
||||||
|
- name: 1
|
||||||
|
title: 示例1
|
||||||
|
inputs:
|
||||||
|
- name: input
|
||||||
|
data: git://example/asr_example.wav
|
||||||
|
inferencespec:
|
||||||
|
cpu: 8 #CPU数量
|
||||||
|
memory: 4096
|
||||||
|
---
|
||||||
|
|
||||||
|
|
||||||
|
# Whisper模型介绍
|
||||||
|
|
||||||
|
## <strong>[ModelScope-FunASR](https://github.com/alibaba-damo-academy/FunASR)</strong>
|
||||||
|
<strong>[FunASR](https://github.com/alibaba-damo-academy/FunASR)</strong>希望在语音识别方面建立学术研究和工业应用之间的桥梁。通过支持在ModelScope上发布的工业级语音识别模型的训练和微调,研究人员和开发人员可以更方便地进行语音识别模型的研究和生产,并促进语音识别生态系统的发展。
|
||||||
|
|
||||||
|
[**最新动态**](https://github.com/alibaba-damo-academy/FunASR#whats-new)
|
||||||
|
| [**环境安装**](https://github.com/alibaba-damo-academy/FunASR#installation)
|
||||||
|
| [**介绍文档**](https://alibaba-damo-academy.github.io/FunASR/en/index.html)
|
||||||
|
| [**中文教程**](https://github.com/alibaba-damo-academy/FunASR/wiki#funasr%E7%94%A8%E6%88%B7%E6%89%8B%E5%86%8C)
|
||||||
|
| [**服务部署**](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime)
|
||||||
|
| [**模型库**](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md)
|
||||||
|
| [**联系我们**](https://github.com/alibaba-damo-academy/FunASR#contact)
|
||||||
|
|
||||||
|
|
||||||
|
## 基于ModelScope进行推理
|
||||||
|
|
||||||
|
- 推理支持音频格式如下:
|
||||||
|
- wav文件路径,例如:data/test/audios/asr_example.wav
|
||||||
|
- pcm文件路径,例如:data/test/audios/asr_example.pcm
|
||||||
|
- wav文件url,例如:https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav
|
||||||
|
- wav二进制数据,格式bytes,例如:用户直接从文件里读出bytes数据或者是麦克风录出bytes数据。
|
||||||
|
- 已解析的audio音频,例如:audio, rate = soundfile.read("asr_example_zh.wav"),类型为numpy.ndarray或者torch.Tensor。
|
||||||
|
- wav.scp文件,需符合如下要求:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
cat wav.scp
|
||||||
|
asr_example1 data/test/audios/asr_example1.wav
|
||||||
|
asr_example2 data/test/audios/asr_example2.wav
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
- 若输入格式wav文件url,api调用方式可参考如下范例:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from modelscope.pipelines import pipeline
|
||||||
|
from modelscope.utils.constant import Tasks
|
||||||
|
|
||||||
|
inference_pipeline = pipeline(
|
||||||
|
task=Tasks.auto_speech_recognition,
|
||||||
|
model='iic/Whisper-large-v3', model_revision="v2.0.5")
|
||||||
|
|
||||||
|
rec_result = inference_pipeline(input='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav', language=None)
|
||||||
|
print(rec_result)
|
||||||
|
```
|
||||||
|
|
||||||
|
- 输入音频为pcm格式,调用api时需要传入音频采样率参数fs,例如:
|
||||||
|
|
||||||
|
```python
|
||||||
|
rec_result = inference_pipeline(input='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.pcm', fs=16000)
|
||||||
|
```
|
||||||
|
|
||||||
|
- 输入音频为wav格式,api调用方式可参考如下范例:
|
||||||
|
|
||||||
|
```python
|
||||||
|
rec_result = inference_pipeline(input='asr_example_zh.wav')
|
||||||
|
```
|
||||||
|
|
||||||
|
- 若输入格式为文件wav.scp(注:文件名需要以.scp结尾),可添加 output_dir 参数将识别结果写入文件中,api调用方式可参考如下范例:
|
||||||
|
|
||||||
|
```python
|
||||||
|
inference_pipeline(input="wav.scp", output_dir='./output_dir')
|
||||||
|
```
|
||||||
|
识别结果输出路径结构如下:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
tree output_dir/
|
||||||
|
output_dir/
|
||||||
|
└── 1best_recog
|
||||||
|
├── score
|
||||||
|
└── text
|
||||||
|
|
||||||
|
1 directory, 3 files
|
||||||
|
```
|
||||||
|
score:识别路径得分
|
||||||
|
|
||||||
|
text:语音识别结果文件
|
||||||
|
|
||||||
|
|
||||||
|
- 若输入音频为已解析的audio音频,api调用方式可参考如下范例:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import soundfile
|
||||||
|
|
||||||
|
waveform, sample_rate = soundfile.read("asr_example_zh.wav")
|
||||||
|
rec_result = inference_pipeline(input=waveform)
|
||||||
|
```
|
||||||
|
|
||||||
|
- ASR、VAD、PUNC模型自由组合
|
||||||
|
|
||||||
|
可根据使用需求对VAD和PUNC标点模型进行自由组合,使用方式如下:
|
||||||
|
```python
|
||||||
|
inference_pipeline = pipeline(
|
||||||
|
task=Tasks.auto_speech_recognition,
|
||||||
|
model='iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch', model_revision="v2.0.4",
|
||||||
|
vad_model='iic/speech_fsmn_vad_zh-cn-16k-common-pytorch', vad_model_revision="v2.0.4",
|
||||||
|
punc_model='iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch', punc_model_revision="v2.0.4",
|
||||||
|
# spk_model="iic/speech_campplus_sv_zh-cn_16k-common",
|
||||||
|
# spk_model_revision="v2.0.2",
|
||||||
|
)
|
||||||
|
```
|
||||||
|
若不使用PUNC模型,可配置punc_model="",或不传入punc_model参数,如需加入LM模型,可增加配置lm_model='damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch',并设置lm_weight和beam_size参数。
|
||||||
|
|
||||||
|
## 基于FunASR进行推理
|
||||||
|
|
||||||
|
下面为快速上手教程,测试音频([中文](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav),[英文](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav))
|
||||||
|
|
||||||
|
### 可执行命令行
|
||||||
|
在命令行终端执行:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=vad_example.wav
|
||||||
|
```
|
||||||
|
|
||||||
|
注:支持单条音频文件识别,也支持文件列表,列表为kaldi风格wav.scp:`wav_id wav_path`
|
||||||
|
|
||||||
|
### python示例
|
||||||
|
#### 非实时语音识别
|
||||||
|
```python
|
||||||
|
from funasr import AutoModel
|
||||||
|
# paraformer-zh is a multi-functional asr model
|
||||||
|
# use vad, punc, spk or not as you need
|
||||||
|
model = AutoModel(model="paraformer-zh", model_revision="v2.0.4",
|
||||||
|
vad_model="fsmn-vad", vad_model_revision="v2.0.4",
|
||||||
|
punc_model="ct-punc-c", punc_model_revision="v2.0.4",
|
||||||
|
# spk_model="cam++", spk_model_revision="v2.0.2",
|
||||||
|
)
|
||||||
|
res = model.generate(input=f"{model.model_path}/example/asr_example.wav",
|
||||||
|
batch_size_s=300,
|
||||||
|
hotword='魔搭')
|
||||||
|
print(res)
|
||||||
|
```
|
||||||
|
注:`model_hub`:表示模型仓库,`ms`为选择modelscope下载,`hf`为选择huggingface下载。
|
||||||
|
|
||||||
|
#### 实时语音识别
|
||||||
|
|
||||||
|
```python
|
||||||
|
from funasr import AutoModel
|
||||||
|
|
||||||
|
chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
|
||||||
|
encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
|
||||||
|
decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
|
||||||
|
|
||||||
|
model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.4")
|
||||||
|
|
||||||
|
import soundfile
|
||||||
|
import os
|
||||||
|
|
||||||
|
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
|
||||||
|
speech, sample_rate = soundfile.read(wav_file)
|
||||||
|
chunk_stride = chunk_size[1] * 960 # 600ms
|
||||||
|
|
||||||
|
cache = {}
|
||||||
|
total_chunk_num = int(len((speech)-1)/chunk_stride+1)
|
||||||
|
for i in range(total_chunk_num):
|
||||||
|
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
|
||||||
|
is_final = i == total_chunk_num - 1
|
||||||
|
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
|
||||||
|
print(res)
|
||||||
|
```
|
||||||
|
|
||||||
|
注:`chunk_size`为流式延时配置,`[0,10,5]`表示上屏实时出字粒度为`10*60=600ms`,未来信息为`5*60=300ms`。每次推理输入为`600ms`(采样点数为`16000*0.6=960`),输出为对应文字,最后一个语音片段输入需要设置`is_final=True`来强制输出最后一个字。
|
||||||
|
|
||||||
|
#### 语音端点检测(非实时)
|
||||||
|
```python
|
||||||
|
from funasr import AutoModel
|
||||||
|
|
||||||
|
model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
|
||||||
|
|
||||||
|
wav_file = f"{model.model_path}/example/asr_example.wav"
|
||||||
|
res = model.generate(input=wav_file)
|
||||||
|
print(res)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 语音端点检测(实时)
|
||||||
|
```python
|
||||||
|
from funasr import AutoModel
|
||||||
|
|
||||||
|
chunk_size = 200 # ms
|
||||||
|
model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
|
||||||
|
|
||||||
|
import soundfile
|
||||||
|
|
||||||
|
wav_file = f"{model.model_path}/example/vad_example.wav"
|
||||||
|
speech, sample_rate = soundfile.read(wav_file)
|
||||||
|
chunk_stride = int(chunk_size * sample_rate / 1000)
|
||||||
|
|
||||||
|
cache = {}
|
||||||
|
total_chunk_num = int(len((speech)-1)/chunk_stride+1)
|
||||||
|
for i in range(total_chunk_num):
|
||||||
|
speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
|
||||||
|
is_final = i == total_chunk_num - 1
|
||||||
|
res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
|
||||||
|
if len(res[0]["value"]):
|
||||||
|
print(res)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 标点恢复
|
||||||
|
```python
|
||||||
|
from funasr import AutoModel
|
||||||
|
|
||||||
|
model = AutoModel(model="ct-punc", model_revision="v2.0.4")
|
||||||
|
|
||||||
|
res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
|
||||||
|
print(res)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 时间戳预测
|
||||||
|
```python
|
||||||
|
from funasr import AutoModel
|
||||||
|
|
||||||
|
model = AutoModel(model="fa-zh", model_revision="v2.0.4")
|
||||||
|
|
||||||
|
wav_file = f"{model.model_path}/example/asr_example.wav"
|
||||||
|
text_file = f"{model.model_path}/example/text.txt"
|
||||||
|
res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
|
||||||
|
print(res)
|
||||||
|
```
|
||||||
|
|
||||||
|
更多详细用法([示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining))
|
||||||
|
|
||||||
|
|
||||||
|
## 微调
|
||||||
|
|
||||||
|
详细用法([示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining))
|
||||||
|
|
||||||
|
## 使用方式以及适用范围
|
||||||
|
|
||||||
|
运行范围
|
||||||
|
- 支持Linux-x86_64、Mac和Windows运行。
|
||||||
|
|
||||||
|
使用方式
|
||||||
|
- 直接推理:可以直接对输入音频进行解码,输出目标文字。
|
||||||
|
|
||||||
|
使用范围与目标场景
|
||||||
|
- 适合于离线语音识别场景
|
||||||
|
|
||||||
|
|
||||||
|
## 模型局限性以及可能的偏差
|
||||||
|
|
||||||
|
考虑到特征提取流程和工具以及训练工具差异,会对CER的数据带来一定的差异(<0.1%),推理GPU环境差异导致的RTF数值差异。
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## 相关论文以及引用信息
|
||||||
|
|
||||||
|
```BibTeX
|
||||||
|
@inproceedings{radford2023robust,
|
||||||
|
title={Robust speech recognition via large-scale weak supervision},
|
||||||
|
author={Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
|
||||||
|
booktitle={International Conference on Machine Learning},
|
||||||
|
pages={28492--28518},
|
||||||
|
year={2023},
|
||||||
|
organization={PMLR}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
38
config.yaml
Normal file
38
config.yaml
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
# network architecture
|
||||||
|
model: WhisperWarp
|
||||||
|
model_conf:
|
||||||
|
lsm_weight: 0.1
|
||||||
|
length_normalized_loss: true
|
||||||
|
hub: funasr # openai
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# only use for hub == funasr,
|
||||||
|
# if hub == openai, dims are automatically downloaded
|
||||||
|
dims:
|
||||||
|
n_mels: 128
|
||||||
|
n_vocab: 51866
|
||||||
|
n_audio_ctx: 1500
|
||||||
|
n_audio_state: 1280
|
||||||
|
n_audio_head: 20
|
||||||
|
n_audio_layer: 32
|
||||||
|
n_text_ctx: 448
|
||||||
|
n_text_state: 1280
|
||||||
|
n_text_head: 20
|
||||||
|
n_text_layer: 32
|
||||||
|
|
||||||
|
# frontend related
|
||||||
|
frontend: WhisperFrontend
|
||||||
|
frontend_conf:
|
||||||
|
fs: 16000
|
||||||
|
n_mels: ${dims.n_mels}
|
||||||
|
do_pad_trim: true
|
||||||
|
|
||||||
|
tokenizer: WhisperTokenizer
|
||||||
|
tokenizer_conf:
|
||||||
|
language: null
|
||||||
|
task: transcribe
|
||||||
|
is_multilingual: true
|
||||||
|
num_languages: 100
|
||||||
|
|
||||||
|
scope_map: [none, "model."]
|
||||||
12
configuration.json
Normal file
12
configuration.json
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
{
|
||||||
|
"framework": "pytorch",
|
||||||
|
"task" : "auto-speech-recognition",
|
||||||
|
"model": {"type" : "funasr"},
|
||||||
|
"pipeline": {"type":"funasr-pipeline"},
|
||||||
|
"model_name_in_hub": {
|
||||||
|
"ms":"",
|
||||||
|
"hf":""},
|
||||||
|
"file_path_metas": {
|
||||||
|
"init_param":"large-v3.pt",
|
||||||
|
"config":"config.yaml"}
|
||||||
|
}
|
||||||
BIN
example/asr_example.wav
Normal file
BIN
example/asr_example.wav
Normal file
Binary file not shown.
BIN
large-v3.pt
(Stored with Git LFS)
Normal file
BIN
large-v3.pt
(Stored with Git LFS)
Normal file
Binary file not shown.
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
openai-whisper
|
||||||
Loading…
Reference in New Issue
Block a user