jeonghoonkang · jeonghoonkang · Jul 26, 2025
diff --git a/apps/qwen_streamlit_qa/README.md b/apps/qwen_streamlit_qa/README.md
@@ -8,10 +8,22 @@
    ```bash
    pip install streamlit transformers huggingface_hub
    ```
    환경 변수 `QWEN_MODEL`에 로컬 모델 경로 또는 HuggingFace 모델 이름을 지정할 수 있습니다.
+   GGUF 모델을 사용할 경우 `QWEN_GGUF_MODEL` 환경 변수로 모델 파일 경로를 설정할 수 있습니다.
 2. 앱 실행
    ```bash
    streamlit run app.py
    ```
 
-기본적으로 `Qwen/Qwen1.5-7B-Chat` 모델을 사용하며, 로컬에 모델이 없으면 다운로드합니다. GPU 사용 가능 여부도 시작 시 표시됩니다.
+기본적으로 `Qwen/Qwen1.5-7B-Chat` 모델을 사용하며, 로컬에 모델이 없으면 다운로드합니다.
+사이드바의 **모델 선택** 메뉴에서 `Qwen1.5-1.8B-Chat (GGUF)` 모델을 선택해 사용할 수도 있습니다.
+GGUF 모델을 사용할 때는 `llama_cpp` 패키지가 필요합니다(`pip install llama-cpp-python`으로 설치).
+앱을 실행하면 화면 상단에 GPU 사용 가능 여부가 표시되고, 이어서 현재 GPU 메모리 사용량과
+모델이 지원하는 최대 입력 토큰 수가 함께 표시됩니다.
+모델 로딩 시 `device_map="auto"` 옵션을 사용하여 가용한 GPU가 자동으로 활용됩니다.
+
+질문 입력창 옆에는 사람 이모지가, 답변 출력에는 귀여운 로봇 이모지가 표시됩니다.
+
+### 오류 확인
+
+모델 실행 중 문제가 발생하면 화면 하단에 오류 메시지와 함께 상세 내용을 볼 수 있는 창이 나타납니다.
diff --git a/apps/qwen_streamlit_qa/app.py b/apps/qwen_streamlit_qa/app.py
@@ -14,23 +14,46 @@ def rerun() -> None:
         st.rerun()
 
 
-def display_gpu_status() -> None:
-    """Display current GPU status at startup."""
+def display_gpu_status(tokenizer=None) -> None:
+    """Display current GPU and model info at startup."""
     try:
         import torch
 
         if torch.cuda.is_available():
-            gpus = [f"{i}: {torch.cuda.get_device_name(i)}" for i in range(torch.cuda.device_count())]
-            st.info("GPU available: " + ", ".join(gpus))
+            gpu_names = [f"{i}: {torch.cuda.get_device_name(i)}" for i in range(torch.cuda.device_count())]
+            st.info("GPU available: " + ", ".join(gpu_names))
+
+            mem_info = []
+            for i in range(torch.cuda.device_count()):
+                total = torch.cuda.get_device_properties(i).total_memory // (1024 ** 2)
+                allocated = torch.cuda.memory_allocated(i) // (1024 ** 2)
+                mem_info.append(f"{i}: {allocated}MB/{total}MB")
+            st.info("GPU memory usage: " + ", ".join(mem_info))
         else:
             st.info("GPU not available, using CPU")
-    except Exception as exc:
+    except Exception as exc:  # pragma: no cover - GPU inspection can fail
         st.warning(f"Could not determine GPU status: {exc}")
 
+    if tokenizer is not None:
+        try:
+            st.info(f"Max input tokens: {getattr(tokenizer, 'model_max_length', 'unknown')}")
+        except Exception:  # pragma: no cover - tokenizer may be malformed
+            pass
+
 st.set_page_config(page_title="Qwen Q&A", page_icon="🎃")
 
 st.title("Qwen 기반 Q&A 데모")
-display_gpu_status()
+
+DEFAULT_MODEL = os.environ.get("QWEN_MODEL", "Qwen/Qwen1.5-7B-Chat")
+GGUF_MODEL = os.environ.get("QWEN_GGUF_MODEL", "Qwen/Qwen1.5-1.8B-Chat-GGUF")
+
+MODEL_OPTIONS = {
+    "Qwen 7B Chat": DEFAULT_MODEL,
+    "Qwen1.5-1.8B-Chat (GGUF)": GGUF_MODEL,
+}
+
+model_choice = st.sidebar.selectbox("모델 선택", list(MODEL_OPTIONS.keys()))
+MODEL_NAME = MODEL_OPTIONS[model_choice]
 
 
 def download_model(model_name: str) -> None:
@@ -108,22 +131,49 @@ def ensure_model(model_name: str) -> None:
     else:
         st.stop()
 
-MODEL_NAME = os.environ.get("QWEN_MODEL", "Qwen/Qwen1.5-7B-Chat")
 ensure_model(MODEL_NAME)
 
 
 @st.cache_resource
 def load_model(name: str):
+    """Load either a transformers or gguf model depending on selection."""
+    if name == GGUF_MODEL:
+        try:
+            from llama_cpp import Llama
+        except Exception:
+            st.error("llama_cpp 패키지가 필요합니다")
+            st.stop()
+
+        llm = Llama(model_path=name)
+
+        def _generate(prompt: str, max_length: int = 512):
+            result = llm.create_completion(prompt, max_tokens=max_length)
+            return result["choices"][0]["text"]
+
+        return _generate
+
     tokenizer = AutoTokenizer.from_pretrained(name)
-    model = AutoModelForCausalLM.from_pretrained(name)
+    # Let transformers automatically place model weights on the best device.
+    model = AutoModelForCausalLM.from_pretrained(name, device_map="auto")
     generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
     return generator
 
 
 generator = load_model(MODEL_NAME)
+display_gpu_status(getattr(generator, "tokenizer", None))
 
-prompt = st.text_input("질문을 입력하세요:")
+prompt = st.text_input("👤 질문을 입력하세요:")
+error_area = st.empty()
 if prompt:
-    with st.spinner("답변 생성 중..."):
-        response = generator(prompt, max_length=512, do_sample=True)
-        st.write(response[0]["generated_text"][len(prompt):].strip())
+    try:
+        with st.spinner("답변 생성 중..."):
+            if MODEL_NAME == GGUF_MODEL:
+                answer = generator(prompt, max_length=512)
+            else:
+                response = generator(prompt, max_length=512, do_sample=True)
+                answer = response[0]["generated_text"][len(prompt):].strip()
+        st.write("🤖 " + answer)
+    except Exception as exc:  # pragma: no cover - GUI display
+        error_area.error("답변 생성 중 오류가 발생했습니다.")
+        with st.expander("오류 상세 보기"):
+            st.exception(exc)