From 8a3f2241d55da1db6931593e5446a538e17f094f Mon Sep 17 00:00:00 2001 From: JavanTang Date: Mon, 23 Jun 2025 11:44:47 +0800 Subject: [PATCH 1/3] Added Kimi-Audio web demo, supporting text and voice input, with AI generating both text and voice responses, including chat history and model parameter adjustment features. --- kimi_web_demo.py | 419 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 419 insertions(+) create mode 100644 kimi_web_demo.py diff --git a/kimi_web_demo.py b/kimi_web_demo.py new file mode 100644 index 0000000..9a5e23d --- /dev/null +++ b/kimi_web_demo.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python3 +import os +import gradio as gr +import soundfile as sf +import time +import tempfile +import logging +from datetime import datetime +from kimia_infer.api.kimia import KimiAudio + +# Setup logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + + +class KimiWebChat: + def __init__( + self, + model_path="moonshotai/Kimi-Audio-7B-Instruct", + output_dir="output", + ): + """Initialize Kimi Audio Web Chat""" + self.output_dir = output_dir + os.makedirs(output_dir, exist_ok=True) + + # Model configuration + logger.info(f"Loading model: {model_path}") + self.model = KimiAudio(model_path=model_path, load_detokenizer=True) + logger.info("Model loaded successfully!") + + # Default sampling parameters + self.sampling_params = { + "audio_temperature": 0.8, + "audio_top_k": 10, + "text_temperature": 0.0, + "text_top_k": 5, + "audio_repetition_penalty": 1.0, + "audio_repetition_window_size": 64, + "text_repetition_penalty": 1.0, + "text_repetition_window_size": 16, + } + + # Audio sample rate + self.sample_rate = 24000 + + # Chat history + self.chat_history = [] + + def process_chat(self, user_input, audio_input, history): + """Process chat request""" + try: + # Build message list + messages = [] + + # Add historical messages + for h in history: + if h[0]: # User message + if isinstance(h[0], list): + messages.append( + {"role": "user", "message_type": "text", "content": h[0][0]} + ) + else: + messages.append( + { + "role": "user", + "message_type": "audio", + "content": h[0][1], + } + ) + if h[1]: # Assistant message + messages.append( + { + "role": "assistant", + "message_type": "audio-text", + "content": h[1], + } + ) + + # Process current user input + audio_path = None # Initialize audio path + + if user_input: + messages.append( + {"role": "user", "message_type": "text", "content": user_input} + ) + + if audio_input is not None: + # Save audio file + timestamp = int(time.time()) + audio_path = os.path.join(self.output_dir, f"input_{timestamp}.wav") + sf.write(audio_path, audio_input[1], audio_input[0]) + messages.append( + {"role": "user", "message_type": "audio", "content": audio_path} + ) + logger.info(f"Saved user audio: {audio_path}") + + if not user_input and audio_input is None: + return history, history, None, "" + + # Generate response + logger.info("Generating response...") + wav_output, text_output = self.model.generate( + messages, **self.sampling_params, output_type="both" + ) + + # Process text response + text_reply = ( + text_output + if text_output + else "Sorry, I didn't generate a text response." + ) + + # Process audio response + audio_response = None + output_path = None + if wav_output is not None: + timestamp = int(time.time()) + output_path = os.path.join(self.output_dir, f"output_{timestamp}.wav") + audio_data = wav_output.detach().cpu().view(-1).numpy() + sf.write(output_path, audio_data, self.sample_rate) + audio_response = output_path + logger.info(f"Saved AI audio response: {output_path}") + + # Construct display message: if there's audio, show "šŸŽµ [Audio Response] + text", otherwise just text + if output_path: + bot_display_message = f"šŸŽµ [Audio Response]\n{text_reply}" + else: + bot_display_message = text_reply + + # Construct history message: format for next conversation [audio_path, text_content] + bot_history_message = ( + [output_path, text_reply] if output_path else text_reply + ) + + # Construct user message in different formats + # User message for display (Chatbot interface) + if user_input and audio_path: + user_display_msg = f"{user_input}\nšŸŽ¤ [Contains Audio]" + elif user_input: + user_display_msg = user_input + else: + user_display_msg = "šŸŽ¤ [Audio Message]" + + # User message for model processing (history format) + if user_input and audio_path: + user_history_msg = [user_input, audio_path] + elif user_input: + user_history_msg = user_input + else: + user_history_msg = audio_path + + # History for display (Chatbot display) + display_history = history + [[user_display_msg, bot_display_message]] + + # History for model processing (contains complete information) + model_history = history + [[user_history_msg, bot_history_message]] + + return display_history, model_history, audio_response, "" + + except Exception as e: + logger.error(f"Error processing chat: {str(e)}") + error_msg = f"Processing error: {str(e)}" + new_history = history + [[user_input or "[Audio Message]", error_msg]] + return new_history, new_history, None, "" + + def clear_chat(self): + """Clear chat history""" + logger.info("Clearing chat history") + return [], [], None, "" + + def update_params( + self, + audio_temp, + audio_topk, + text_temp, + text_topk, + audio_rep_penalty, + text_rep_penalty, + ): + """Update model parameters""" + self.sampling_params.update( + { + "audio_temperature": audio_temp, + "audio_top_k": int(audio_topk), + "text_temperature": text_temp, + "text_top_k": int(text_topk), + "audio_repetition_penalty": audio_rep_penalty, + "text_repetition_penalty": text_rep_penalty, + } + ) + logger.info(f"Parameters updated: {self.sampling_params}") + return "āœ… Parameters updated successfully!" + + +def create_demo(): + """Create Gradio interface""" + + # Initialize chat handler + chat_handler = KimiWebChat(args.model_path) + + # Custom CSS + css = """ + .chatbot { + height: 500px; + } + .audio-container { + margin: 10px 0; + } + """ + + with gr.Blocks(css=css, title="Kimi-Audio Voice Chatbot") as demo: + gr.Markdown("# šŸŽ¤ Kimi-Audio Voice Assistant") + gr.Markdown( + "Supports text and voice input, AI responds with both text and voice messages" + ) + + with gr.Tab("šŸ’¬ Chat"): + with gr.Row(): + with gr.Column(scale=3): + # Chat display area + chatbot = gr.Chatbot( + label="Conversation History", height=500, show_label=True + ) + + # User input area + with gr.Row(): + with gr.Column(scale=4): + user_input = gr.Textbox( + placeholder="Enter your message...", + label="Text Input", + lines=2, + ) + with gr.Column(scale=1): + send_btn = gr.Button("Send", variant="primary", size="lg") + + # Audio input + audio_input = gr.Audio( + label="Voice Input (Record or upload audio file)", type="numpy" + ) + + # Control buttons + with gr.Row(): + clear_btn = gr.Button("Clear Chat", variant="secondary") + + with gr.Column(scale=1): + # AI audio response + audio_output = gr.Audio( + label="šŸ¤– AI Voice Response", interactive=False, autoplay=True + ) + + # Quick tips + gr.Markdown("### šŸ’” Usage Tips") + gr.Markdown( + """ + - You can input text or record voice + - AI will respond with both text and voice + - Each conversation preserves history + - You can clear the conversation to start fresh + """ + ) + + with gr.Tab("āš™ļø Settings"): + gr.Markdown("### Model Parameter Adjustment") + + with gr.Row(): + with gr.Column(): + gr.Markdown("**Audio Generation Parameters**") + audio_temp = gr.Slider( + 0.0, + 2.0, + value=0.8, + step=0.1, + label="Audio Temperature", + info="Controls randomness in audio generation", + ) + audio_topk = gr.Slider( + 1, + 50, + value=10, + step=1, + label="Audio Top-K", + info="Controls number of candidate audio tokens", + ) + audio_rep_penalty = gr.Slider( + 0.1, + 3.0, + value=1.0, + step=0.1, + label="Audio Repetition Penalty", + info="Prevents repetitive audio segments", + ) + + with gr.Column(): + gr.Markdown("**Text Generation Parameters**") + text_temp = gr.Slider( + 0.0, + 2.0, + value=0.0, + step=0.1, + label="Text Temperature", + info="Controls randomness in text generation", + ) + text_topk = gr.Slider( + 1, + 50, + value=5, + step=1, + label="Text Top-K", + info="Controls number of candidate text tokens", + ) + text_rep_penalty = gr.Slider( + 0.1, + 3.0, + value=1.0, + step=0.1, + label="Text Repetition Penalty", + info="Prevents repetitive text", + ) + + update_btn = gr.Button("Apply Settings", variant="primary") + param_status = gr.Textbox(label="Settings Status", interactive=False) + + with gr.Tab("ā„¹ļø About"): + gr.Markdown( + """ + # Kimi-Audio Multimodal Chatbot + + ## 🌟 Features + - **Multimodal Interaction**: Supports text and voice input/output + - **Real-time Conversation**: Fast response, natural communication + - **History Recording**: Maintains conversation continuity + - **Parameter Adjustment**: Customizable generation parameters for optimal experience + + ## šŸš€ How to Use + 1. In the "Chat" tab, input text or record voice + 2. AI will generate both text and voice responses + 3. Adjust model parameters in the "Settings" tab + 4. Use "Clear Chat" to start a new conversation + + ## šŸ“ Technical Information + - Based on Kimi-Audio-7B-Instruct model + - Supports Chinese and English conversations + - Audio sample rate: 24kHz + + --- + + šŸ”— [Project Homepage](https://github.com/moonshotai/Kimi-Audio) + """ + ) + + # Hidden state variable for saving history + history_state = gr.State([]) + + # Event binding + def handle_chat(user_text, audio, history): + return chat_handler.process_chat(user_text, audio, history) + + # Send button event + send_btn.click( + fn=handle_chat, + inputs=[user_input, audio_input, history_state], + outputs=[chatbot, history_state, audio_output, user_input], + ) + + # Enter to send + user_input.submit( + fn=handle_chat, + inputs=[user_input, audio_input, history_state], + outputs=[chatbot, history_state, audio_output, user_input], + ) + + # Clear conversation + clear_btn.click( + fn=chat_handler.clear_chat, + outputs=[chatbot, history_state, audio_output, user_input], + ) + + # Update parameters + update_btn.click( + fn=chat_handler.update_params, + inputs=[ + audio_temp, + audio_topk, + text_temp, + text_topk, + audio_rep_penalty, + text_rep_penalty, + ], + outputs=param_status, + ) + + return demo + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Kimi-Audio Web Demo") + parser.add_argument( + "--model_path", + type=str, + default="moonshotai/Kimi-Audio-7B-Instruct", + help="Model path", + ) + parser.add_argument("--port", type=int, default=7860, help="Port number") + parser.add_argument("--share", action="store_true", help="Create public link") + + args = parser.parse_args() + + logger.info(f"Starting Kimi-Audio Web Demo") + logger.info(f"Model path: {args.model_path}") + logger.info(f"Port: {args.port}") + + demo = create_demo(args) + demo.launch( + server_port=args.port, server_name="0.0.0.0", share=args.share, show_api=False + ) From dc62c73b0396785120c41accd75665c137aa5257 Mon Sep 17 00:00:00 2001 From: JavanTang Date: Mon, 23 Jun 2025 11:52:46 +0800 Subject: [PATCH 2/3] fix: create_demo function not using args parameter --- kimi_web_demo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kimi_web_demo.py b/kimi_web_demo.py index 9a5e23d..0e6729c 100644 --- a/kimi_web_demo.py +++ b/kimi_web_demo.py @@ -194,7 +194,7 @@ def update_params( return "āœ… Parameters updated successfully!" -def create_demo(): +def create_demo(args): """Create Gradio interface""" # Initialize chat handler From 15bb340525377e31668dbb3bf72e4de81c7a6678 Mon Sep 17 00:00:00 2001 From: JavanTang Date: Mon, 23 Jun 2025 11:54:58 +0800 Subject: [PATCH 3/3] Update README.md, add the Gradio web demo section, including usage instructions and access link. --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 3ace831..4fcdb07 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,18 @@ print("Kimi-Audio inference examples complete.") ``` +### Gradio Web Demo + +![alt text](assets/kimia_gradio.png) + +```bash +python kimi_web_demo.py +``` +Then you can access the demo at `http://localhost:7860`. + + + + ## Evaluation Kimi-Audio achieves state-of-the-art (SOTA) performance across a wide range of audio benchmarks.