AXERA-TECH
/

SenseVoice

Automatic Speech Recognition

Model card Files Files and versions

SenseVoice / python /main.py

inoryQwQ's picture

Fix python

e138696 4 months ago

history blame contribute delete

2.11 kB

	import os
	import argparse
	from SenseVoiceAx import SenseVoiceAx
	import librosa
	import time


	def get_args():
	    """Parse the command-line options of the SenseVoice demo.

	    Options:
	        --input / -i     path of the audio file to transcribe (required).
	        --language / -l  one of auto, zh, en, yue, ja, ko (default: auto).
	        --streaming      switch; when present, streaming mode is selected.

	    Returns:
	        argparse.Namespace with ``input``, ``language`` and ``streaming``.
	    """
	    supported_languages = ["auto", "zh", "en", "yue", "ja", "ko"]

	    cli = argparse.ArgumentParser()
	    cli.add_argument(
	        "--input",
	        "-i",
	        type=str,
	        required=True,
	        help="Input audio file",
	    )
	    cli.add_argument(
	        "--language",
	        "-l",
	        type=str,
	        default="auto",
	        choices=supported_languages,
	    )
	    cli.add_argument("--streaming", action="store_true")

	    namespace = cli.parse_args()
	    return namespace


	def main():
	    """Transcribe one audio file with SenseVoiceAx, offline or streaming.

	    Reads the command-line options via ``get_args()``, picks the model file
	    and ``max_seq_len`` that match the selected mode, builds a
	    ``SenseVoiceAx`` instance and prints the recognition output.

	    In offline mode the whole file is handed to ``model.infer``. In streaming
	    mode the audio is loaded at 16 kHz, fed to ``model.stream_infer`` in
	    100 ms chunks, every yielded result is printed, and finally the
	    real-time factor (processing time / audio duration) is reported.

	    Raises:
	        FileNotFoundError: if the selected model file does not exist.
	    """
	    args = get_args()
	    print(vars(args))

	    input_audio = args.input
	    language = args.language
	    # Relative path: resolved against the current working directory.
	    model_root = "../sensevoice_ax650"
	    if not args.streaming:
	        max_seq_len = 256
	        model_path = os.path.join(model_root, "sensevoice.axmodel")
	    else:
	        max_seq_len = 26
	        model_path = os.path.join(model_root, "streaming_sensevoice.axmodel")

	    # Explicit raise instead of ``assert``: asserts are removed under
	    # ``python -O``, which would silently skip this validation.
	    if not os.path.exists(model_path):
	        raise FileNotFoundError(f"model {model_path} not exist")

	    cmvn_file = os.path.join(model_root, "am.mvn")
	    bpe_model = os.path.join(model_root, "chn_jpn_yue_eng_ko_spectok.bpe.model")
	    token_file = os.path.join(model_root, "tokens.txt")

	    model = SenseVoiceAx(
	        model_path,
	        cmvn_file,
	        token_file,
	        bpe_model,
	        max_seq_len=max_seq_len,
	        beam_size=3,
	        hot_words=None,
	        streaming=args.streaming,
	    )

	    if not args.streaming:
	        asr_res = model.infer(input_audio, language, print_rtf=True)
	        print("ASR result: " + asr_res)
	        return

	    samples, sr = librosa.load(input_audio, sr=16000)
	    # NOTE(review): the 32768 scale presumably maps the loaded samples to
	    # the int16 amplitude range expected by stream_infer -- confirm.
	    samples = (samples * 32768).tolist()
	    # Derive the duration from the actual sample rate rather than repeating
	    # the 16000 literal, so it cannot drift from ``step`` below.
	    duration = len(samples) / sr

	    # perf_counter is monotonic, so the measurement is immune to wall-clock
	    # adjustments while the loop runs.
	    start = time.perf_counter()
	    step = int(0.1 * sr)  # number of samples in 100 ms of audio
	    for i in range(0, len(samples), step):
	        is_last = i + step >= len(samples)
	        for res in model.stream_infer(samples[i : i + step], is_last, language):
	            print(res)
	    cost_time = time.perf_counter() - start

	    # An empty audio file has zero duration; avoid a ZeroDivisionError.
	    if duration > 0:
	        print(f"RTF: {cost_time / duration}")
	    else:
	        print("RTF: n/a (input audio is empty)")


	# Run the demo only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()