diff --git a/README.md b/README.md
index 9fcca13..b0a10dd 100644
--- a/README.md
+++ b/README.md
@@ -17,25 +17,26 @@ LIRA is a **CLI-first, developer-friendly tool**: run and serve ASR models local
 
 - **Python 3.10** is required.
 - We recommend using **conda** for environment management.
-- For RyzenAI NPU flow, follow the [RyzenAI installation instructions](https://ryzenai.docs.amd.com/en/latest/inst.html) and verify drivers/runtime for your device.
+- For RyzenAI NPU flow, follow the [RyzenAI installation instructions](https://ryzenai.docs.amd.com/en/latest/inst.html) and verify drivers/runtime for your device. Ensure that you have a Ryzen AI 300 Series machine to enable NPU use cases.
+- Current recommended Ryzen AI version: RAI 1.5.1 with driver 32.0.203.280.
 
 **Minimal install steps:**
 
 1. **Clone the repo and change directory:**
-   ```bash
-   git clone https://github.com/aigdat/LIRA.git
-   cd LIRA
-   ```
+   ```bash
+   git clone https://github.com/aigdat/LIRA.git
+   cd LIRA
+   ```
 
 2. **Activate your conda environment:**
-   ```bash
-   conda activate ryzen-ai-1.5.0
-   ```
+   ```bash
+   conda activate ryzen-ai-1.5.0
+   ```
 
 3. **Install LIRA in editable mode:**
-   ```bash
-   pip install -e .
-   ```
+   ```bash
+   pip install -e .
+   ```
 
 Now you can run `lira --help` to see available commands.
 
@@ -74,13 +75,13 @@ LIRA includes a FastAPI-based HTTP server for rapid integration with your applic
 
 **Start the server:**
 - **CPU acceleration:**
-  ```bash
-  lira serve --backend openai --model whisper-base --device cpu --host 0.0.0.0 --port 5000
-  ```
+  ```bash
+  lira serve --backend openai --model whisper-base --device cpu --host 0.0.0.0 --port 5000
+  ```
 - **NPU acceleration:**
-  ```bash
-  lira serve --backend openai --model whisper-base --device npu --host 0.0.0.0 --port 5000
-  ```
+  ```bash
+  lira serve --backend openai --model whisper-base --device npu --host 0.0.0.0 --port 5000
+  ```
 
 > Interested in more server features?
 > Try the **LIRA server demo** with Open WebUI.
@@ -179,15 +180,6 @@ LIRA supports multiple speech-model architectures. Runtime support depends on th
 
 *NPU support depends on available Vitis AI export artifacts and target hardware.
 
----
-
-## 📚 Datasets & Examples
-
-- `datasets/LibriSpeech` contains sample audio and transcripts for quick testing.
-  Replace or augment with your own data for benchmarking.
-
----
-
 ## 🧪 Early Access & Open Source Intentions
 
 LIRA is released as an open, community-driven project.
diff --git a/evaluate/evaluate_models.py b/evaluate/evaluate_models.py
deleted file mode 100644
index d922864..0000000
--- a/evaluate/evaluate_models.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved.
-
-import os
-import json
-import torchaudio
-import numpy as np
-from lira.whisper.transcribe import WhisperONNX
-from lira.zipformer.transcribe import EncoderWrapper, DecoderWrapper, JoinerWrapper, CHUNK_LEN
-from lira.utils.audio import extract_fbank
-from lira.utils.tokens import load_tokens
-from jiwer import wer, cer
-import argparse
-from lira.cli.run_asr import greedy_search
-
-SAMPLE_RATE = 16000
-
-def transcribe_whisper(encoder_path, decoder_path, audio_path, providers):
-    whisper_model = WhisperONNX(encoder_path, decoder_path, providers=providers)
-    waveform, sr = torchaudio.load(audio_path)
-    if sr != SAMPLE_RATE:
-        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)(waveform)
-    audio = waveform.squeeze(0).numpy()
-    return whisper_model.transcribe(audio)
-
-def transcribe_zipformer(encoder_path, decoder_path, joiner_path, tokens_path, audio_path, providers):
-    encoder = EncoderWrapper(encoder_path, providers=providers)
-    decoder = DecoderWrapper(decoder_path, providers=providers)
-    joiner = JoinerWrapper(joiner_path, providers=providers)
-    tokens = load_tokens(tokens_path)
-
-    waveform, sr = torchaudio.load(audio_path)
-    if sr != SAMPLE_RATE:
-        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)(waveform)
-    audio = waveform.squeeze(0).numpy()
-    features = extract_fbank(audio)
-
-    state = {}
-    return greedy_search(encoder, decoder, joiner, features, tokens, state)
-
-def evaluate_models(dataset_path, ground_truths, model_configs):
-    results = []
-    for model_config in model_configs:
-        model_type = model_config['type']
-        print(f"Evaluating {model_type} model...")
-        for wav_file, ground_truth in ground_truths.items():
-            audio_path = os.path.join(dataset_path, wav_file+".wav")
-            if model_type == 'whisper':
-                transcription = transcribe_whisper(
-                    model_config['encoder'],
-                    model_config['decoder'],
-                    audio_path,
-                    model_config['providers']
-                )
-            elif model_type == 'zipformer':
-                transcription = transcribe_zipformer(
-                    model_config['encoder'],
-                    model_config['decoder'],
-                    model_config['joiner'],
-                    model_config['tokens'],
-                    audio_path,
-                    model_config['providers']
-                )
-            else:
-                raise ValueError(f"Unsupported model type: {model_type}")
-
-            wer_score = wer(ground_truth, transcription)
-            cer_score = cer(ground_truth, transcription)
-            results.append({
-                'model': model_type,
-                'file': wav_file,
-                'wer': wer_score,
-                'cer': cer_score,
-                'transcription': transcription,
-                'ground_truth': ground_truth
-            })
-
-    return results
-
-def parse_ground_truth(ground_truth_path):
-    ground_truths = {}
-    with open(ground_truth_path, 'r', encoding='utf-8') as f:
-        for line in f:
-            parts = line.strip().split(maxsplit=1)
-            if len(parts) == 2:
-                wav_file, transcript = parts
-                ground_truths[wav_file] = transcript
-    return ground_truths
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--dataset-path", required=True, help="Path to the folder containing .wav files")
-    parser.add_argument("--ground-truth-path", required=True, help="Path to the ground truth .txt file")
-    args = parser.parse_args()
-
-    dataset_path = args.dataset_path
-    ground_truth_path = args.ground_truth_path
-
-    ground_truths = parse_ground_truth(ground_truth_path)
-
-    model_configs = [
-        # {
-        #     'type': 'whisper',
-        #     'encoder': 'path/to/whisper-encoder.onnx',
-        #     'decoder': 'path/to/whisper-decoder.onnx',
-        #     'providers': ['CPUExecutionProvider']
-        # },
-        {
-            'type': 'zipformer',
-            'encoder': 'C:\\Users\\ISWAALEX\\DAToolkit\\sandbox\\asr_sandbox\\encoder.onnx',
-            'decoder': 'C:\\Users\\ISWAALEX\\DAToolkit\\sandbox\\asr_sandbox\\decoder.onnx',
-            'joiner': 'C:\\Users\\ISWAALEX\\DAToolkit\\sandbox\\asr_sandbox\\joiner.onnx',
-            'tokens': 'C:\\Users\\ISWAALEX\\DAToolkit\\sandbox\\asr_sandbox\\tokens.txt',
-            'providers': ['CPUExecutionProvider']
-        }
-    ]
-
-    results = evaluate_models(dataset_path, ground_truths, model_configs)
-    for result in results:
-        print(json.dumps(result, indent=2))
-
-if __name__ == "__main__":
-    main()
diff --git a/main.py b/main.py
deleted file mode 100644
index 7dc7b69..0000000
--- a/main.py
+++ /dev/null
@@ -1,155 +0,0 @@
-# Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved.
-
-import os
-import gradio as gr
-import torchaudio
-import torch
-import numpy as np
-from pathlib import Path
-from huggingface_hub import snapshot_download
-
-from lira.zipformer.transcribe import EncoderWrapper, DecoderWrapper, JoinerWrapper
-from lira.whisper.transcribe import WhisperONNX
-from lira.utils.audio import extract_fbank
-from lira.utils.audio import get_model_providers
-from lira.cli.run_asr import greedy_search, mic_stream
-
-SAMPLE_RATE = 16000
-
-
-def download_and_prepare_models(model_type, target_device, config_path="config/model_config.json"):
-    providers = get_model_providers(
-        model_type=model_type,
-        device=target_device,
-        config_path=config_path
-    )
-
-    if model_type == "whisper":
-        model_dir = snapshot_download(repo_id="aigdat/AMD-Whisper-Base", cache_dir="./hf_models")
-        encoder_path = str(Path(model_dir / "whisper-encoder.onnx"))
-        decoder_path = str(Path(model_dir / "whisper-decoder.onnx"))
-        whisper_model = WhisperONNX(
-            str(encoder_path),
-            str(decoder_path),
-            encoder_provider=providers["encoder"],
-            decoder_provider=providers["decoder"]
-        )
-        return {"model": whisper_model}
-
-    elif model_type == "zipformer":
-        model_dir = snapshot_download(repo_id="aigdat/AMD-zipformer-en", cache_dir="./hf_models")
-        encoder = EncoderWrapper(str(Path(model_dir) / "encoder.onnx"), providers=providers["encoder"])
-        decoder = DecoderWrapper(str(Path(model_dir) / "decoder.onnx"), providers=providers["decoder"])
-        joiner = JoinerWrapper(str(Path(model_dir) / "joiner.onnx"), providers=providers["joiner"])
-        tokens = []
-        with open(Path(model_dir) / "tokens.txt", 'r', encoding='utf-8') as f:
-            tokens = [line.strip().split()[0] for line in f]
-        return {"encoder": encoder, "decoder": decoder, "joiner": joiner, "tokens": tokens}
-
-    else:
-        raise ValueError(f"Unsupported model type: {model_type}")
-
-
-def transcribe_audio(audio, model_type, target_device, duration=5):
-    with gr.Row():
-        status = gr.Textbox(value="⏳ Downloading and preparing model...", visible=True, interactive=False)
-    models = download_and_prepare_models(model_type, target_device)
-    status.value = "✅ Model ready!"
-
-    if isinstance(audio, np.ndarray):
-        # Mic input from Gradio
-        sr = SAMPLE_RATE
-        audio = audio[:, 0] if audio.ndim > 1 else audio
-
-    elif isinstance(audio, tuple):
-        # Uploaded file
-        if len(audio) != 2 or not isinstance(audio[0], (np.ndarray, torch.Tensor)):
-            return "⚠️ Invalid audio data."
-        audio, sr = audio
-        audio = audio[:, 0] if audio.ndim > 1 else audio
-
-    elif isinstance(audio, str) and os.path.exists(audio):
-        # File path
-        waveform, sr = torchaudio.load(audio)
-        audio = waveform.squeeze(0).numpy()
-
-    else:
-        return "⚠️ No valid audio input detected."
-
-    if sr != SAMPLE_RATE:
-        audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)(torch.tensor(audio)).numpy()
-
-    if model_type == "whisper":
-        whisper_model = models["model"]
-        text = whisper_model.transcribe(audio)
-        return text
-
-    elif model_type == "zipformer":
-        encoder = models["encoder"]
-        decoder = models["decoder"]
-        joiner = models["joiner"]
-        tokens = models["tokens"]
-        features = extract_fbank(audio)
-        state = {}
-        text = greedy_search(encoder, decoder, joiner, features, tokens, state)
-        return text
-
-
-def start_mic_stream(model_type, target_device, duration=5):
-    models = download_and_prepare_models(model_type, target_device)
-
-    if model_type == "whisper":
-        whisper_model = models["model"]
-
-        def transcribe_fn(audio, state):
-            return whisper_model.transcribe(audio)
-
-    elif model_type == "zipformer":
-        encoder = models["encoder"]
-        decoder = models["decoder"]
-        joiner = models["joiner"]
-        tokens = models["tokens"]
-
-        def transcribe_fn(features, state):
-            return greedy_search(encoder, decoder, joiner, features, tokens, state)
-
-    mic_stream(transcribe_fn, duration)
-    return "🗣️ Transcription from microphone completed. See console for live text."
-
-
-def build_interface():
-    with gr.Blocks() as demo:
-        gr.Markdown("# 🗣️ LIAR: Local Interface for Audio Recognition")
-        gr.Markdown("### ASR Demo supporting Zipformer and Whisper")
-
-        model_selector = gr.Dropdown(["zipformer", "whisper"], label="Model Type", value="zipformer")
-        device_selector = gr.Dropdown(["cpu", "npu"], label="Device Selector", value="cpu")
-        duration_input = gr.Number(label="Mic Input Duration (seconds)", value=5)
-
-        with gr.Tab("Upload WAV File"):
-            file_input = gr.Audio(sources="upload", type="filepath", label="Upload a .wav file")
-            file_button = gr.Button("Transcribe File")
-            file_output = gr.Textbox(label="Transcription")
-
-        with gr.Tab("Mic Input"):
-            mic_button = gr.Button("Start Mic Transcription")
-            mic_output = gr.Textbox(label="Mic Output")
-
-        file_button.click(
-            fn=transcribe_audio,
-            inputs=[file_input, model_selector, device_selector],
-            outputs=file_output
-        )
-
-        mic_button.click(
-            fn=start_mic_stream,
-            inputs=[model_selector, device_selector, duration_input],
-            outputs=mic_output
-        )
-
-    return demo
-
-
-if __name__ == "__main__":
-    demo = build_interface()
-    demo.launch()
diff --git a/setup.py b/setup.py
index 13fb158..6b45973 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
         "torch",
         "torchaudio",
         "sounddevice",
-        "transformers==4.38.0",
+        "transformers==4.52.4",
         "soundfile",
         "gradio",
         "jiwer",