Changes from all commits (33 commits)
- 397f7d9 add: uv pyproject.toml (popiemon, Mar 5, 2025)
- b9884e3 add: src (popiemon, Mar 5, 2025)
- 27bb485 add: Dockerfile (popiemon, Mar 5, 2025)
- 5965d75 add: devcontainer.json (popiemon, Mar 5, 2025)
- 83399c9 fix: toml (popiemon, Mar 5, 2025)
- 76f8caa feat: ignore uv.lock (popiemon, Mar 5, 2025)
- 0c0ed76 add: .env.sample (popiemon, Mar 5, 2025)
- d8d703f fix: delete src (popiemon, Mar 5, 2025)
- 4b3442b add: update dependencies to include huggingface-hub (popiemon, Mar 5, 2025)
- d8aa952 feat: ignore results (popiemon, Mar 5, 2025)
- e5f0946 feat: pft success (popiemon, Mar 5, 2025)
- 643d9dd add: dev group (popiemon, Mar 5, 2025)
- 645a25e docs: Docker (popiemon, Mar 5, 2025)
- 6de1bd6 feat: remove_columns (popiemon, Mar 5, 2025)
- 26c8119 feat: ignore results (popiemon, Mar 5, 2025)
- 14287e4 del: .env (popiemon, Mar 5, 2025)
- 3f61dbe fix: model_name (popiemon, Mar 5, 2025)
- a41e4c7 docs: uv sync (popiemon, Mar 5, 2025)
- 8ba3fdd fix: logging (popiemon, Mar 5, 2025)
- 319fdbf Update Dockerfile (popiemon, Mar 5, 2025)
- d8d0bec Update training.py (popiemon, Mar 5, 2025)
- 856ac3e fix: devcontainer.json (popiemon, Mar 5, 2025)
- ca6f62e Merge branch 'feature/#15' of https://github.com/popiemon/semikong in… (popiemon, Mar 5, 2025)
- 3e837e4 fix: Dockerfile (popiemon, Mar 5, 2025)
- 8b8e1bb feat: package version (popiemon, Mar 5, 2025)
- be7ee11 revert training-config (popiemon, Mar 5, 2025)
- d030943 fix: dataset.map (popiemon, Mar 7, 2025)
- 5bfcc5f feat: ignore data (popiemon, Mar 7, 2025)
- be908e3 feat: training config (popiemon, Mar 7, 2025)
- 667d955 Update training.py (popiemon, Mar 7, 2025)
- 7b0db2f fix: training.py (popiemon, Mar 7, 2025)
- 9a90c03 fix: training.py (popiemon, Mar 7, 2025)
- f3c2dc8 Update training.py (popiemon, Mar 7, 2025)
24 changes: 24 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,24 @@
{
    "name": "semikong",
    "dockerComposeFile": "../compose.yaml",
    "service": "semikong",
    "workspaceFolder": "/workspace",
    "customizations": {
        "vscode": {
            "extensions": [
                "GitHub.copilot-chat",
                "codezombiech.gitignore",
                "GitHub.copilot",
                "saoudrizwan.claude-dev",
                "njpwerner.autodocstring",
                "ms-python.mypy-type-checker",
                "ms-python.vscode-pylance",
                "ms-python.python",
                "ms-python.debugpy",
                "ms-python.isort",
                "ms-python.black-formatter"
            ]
        }
    }
}

4 changes: 4 additions & 0 deletions .gitignore
@@ -160,3 +160,7 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
+
+*.lock
+results*
+data/*
1 change: 1 addition & 0 deletions .python-version
@@ -0,0 +1 @@
3.10
17 changes: 17 additions & 0 deletions Dockerfile
@@ -0,0 +1,17 @@
FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime

# The installer requires curl (and certificates) to download the release archive
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl=7.81.0-1ubuntu1.20 \
        ca-certificates=20240203~22.04.1 \
        git=1:2.34.1-1ubuntu1.12 \
        vim=2:8.2.3995-1ubuntu2.23 \
    && rm -rf /var/lib/apt/lists/*

# Download the latest installer
ADD https://astral.sh/uv/install.sh /uv-installer.sh

# Run the installer then remove it
RUN sh /uv-installer.sh && rm /uv-installer.sh

# Ensure the installed binary is on the `PATH`
ENV PATH="/opt/conda/bin:$PATH"
ENV UV_PROJECT_ENVIRONMENT="/opt/conda"
ENV UV_SYSTEM_PYTHON=true

WORKDIR /workspace
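A quick smoke test of this image might look like the following. This is a sketch only: it assumes the `semikong` compose service defined below and that the installer leaves `uv` on the `PATH`.

```sh
# Hypothetical smoke test; assumes the "semikong" service from compose.yaml
docker compose build semikong
docker compose run --rm semikong uv --version
```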
12 changes: 11 additions & 1 deletion README.md
@@ -323,7 +323,17 @@ You can perform inference with SEMIKONG chat or base models as below.
</p>

### Quick start - Docker
-TBA
+You should install Docker and VSCode.
+
+Install the Dev Containers extension.
+Then select "Reopen in Container."
+
+You should now be able to work with the SEMIKONG project inside the Docker container using VSCode.
+
+Run the following command to install all packages:
+```sh
+uv sync --all-extras
+```
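Once the sync finishes, training can be launched from the workspace root. A minimal sketch, assuming the default config path wired into training.py in this PR and a dataset already placed under data/:

```sh
# Hypothetical invocation; --config defaults to configs/training-config.yaml
uv run python training.py --config configs/training-config.yaml
```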

<p align="right"> [
<a href="#top">Back to top ⬆️ </a> ]
18 changes: 18 additions & 0 deletions compose.yaml
@@ -0,0 +1,18 @@
services:
  semikong:
    build:
      context: .
      dockerfile: Dockerfile
    image: semikong
    container_name: semikong
    tty: true
    stdin_open: true
    volumes:
      - ./:/workspace
    working_dir: /workspace
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
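To confirm the GPU reservation actually reaches the container, one quick check might be the following (a sketch, assuming the NVIDIA Container Toolkit is installed on the host):

```sh
# Hypothetical GPU visibility check inside the service container
docker compose run --rm semikong nvidia-smi
```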
5 changes: 4 additions & 1 deletion configs/training-config.yaml
@@ -1,3 +1,5 @@
+demo: True
+
model:
  model_name: "model_path_folder_or_model_name_hf"
  new_model: "semikong-8b"
@@ -7,7 +9,8 @@ model:
  bnb_4bit_quant_type: "nf4"
  device_map: {"": 0}
  output_dir: "./results"
-  dataset_name: "dataset_path_folder_or_dataset_name_hf"
+  dataset_name: /workspace/data/SemiKong_Training_Dataset/train-00000-of-00001.parquet
+  remove_columns: ["input"]

training:
  num_train_epochs: 2
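For orientation, the configuration consumed by training.py now has roughly the shape below. This is a sketch assembled only from keys visible in this PR; values are placeholders, and keys not shown in the diffs are omitted rather than guessed.

```yaml
# Hypothetical consolidated view of configs/training-config.yaml (keys seen in this PR only)
demo: True                # training.py trims the dataset to 50 rows when true

model:
  model_name: "model_path_folder_or_model_name_hf"
  new_model: "semikong-8b"
  bnb_4bit_quant_type: "nf4"
  device_map: {"": 0}
  output_dir: "./results"
  dataset_name: /workspace/data/SemiKong_Training_Dataset/train-00000-of-00001.parquet
  remove_columns: ["input"]

training:
  num_train_epochs: 2
  learning_rate: 2e-4     # placeholder value; training.py wraps this in float()
```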
30 changes: 30 additions & 0 deletions pyproject.toml
@@ -0,0 +1,30 @@
[project]
name = "semikong"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
authors = [
    { name = "", email = "" }
]
requires-python = ">=3.10"
dependencies = [
    "bitsandbytes>=0.45.3",
    "datasets>=3.3.2",
    "huggingface-hub>=0.29.1",
    "loguru>=0.7.3",
    "numpy>=2.2.3",
    "peft>=0.14.0",
    "transformers>=4.49.0",
    "trl>=0.15.2",
    "vllm>=0.5.0.post1",
]

[dependency-groups]
dev = [
    "black>=25.1.0",
    "isort>=6.0.1",
    "pre-commit>=4.1.0",
    "pylint>=3.3.4",
    "tqdm>=4.67.1",
]
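One note on the dev group: with uv, the `dev` dependency group is installed by default on a plain `uv sync`, while the README's `--all-extras` flag targets `[project.optional-dependencies]`, which this file does not define. A quick way to see the distinction:

```sh
# The dev dependency group installs by default with uv
uv sync            # project dependencies + dev group
uv sync --no-dev   # project dependencies only
```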

7 changes: 4 additions & 3 deletions raw_inference.py
@@ -1,10 +1,11 @@
import argparse
-import logging

import torch
+import transformers
import yaml
from peft import LoraConfig, PeftModel
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          BitsAndBytesConfig, pipeline)


def load_config(config_file="./configs/inference-config.yaml"):
@@ -70,7 +71,7 @@ def text_gen_eval_wrapper(model, tokenizer, prompt, max_length=200, temperature=
    A wrapper function for inferencing, generating text based on a prompt.
    """
    # Suppress logging
-    logging.set_verbosity(logging.CRITICAL)
+    transformers.logging.set_verbosity(transformers.logging.CRITICAL)

    # Initialize text generation pipeline
    pipe = pipeline(
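The verbosity fix above works because the standard library logging module has no set_verbosity; that helper lives in transformers' own logging facade. A minimal illustration of the API being targeted:

```python
import transformers

# transformers ships its own logging helpers, separate from stdlib logging
transformers.logging.set_verbosity(transformers.logging.CRITICAL)
print(transformers.logging.get_verbosity())  # 50, i.e. CRITICAL
```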
20 changes: 10 additions & 10 deletions training.py
@@ -4,7 +4,8 @@
import yaml
from datasets import load_dataset
from peft import LoraConfig
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          BitsAndBytesConfig, TrainingArguments)
from trl import SFTTrainer


@@ -51,6 +52,7 @@ def configure_lora(config):
        r=lora_params["lora_r"],
        bias="none",
        task_type="CAUSAL_LM",
+        target_modules=["q_proj", "v_proj"],
    )


@@ -71,7 +73,7 @@ def template_dataset(sample, tokenizer):
def main():
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Fine-tune a model with LoRA and 4-bit precision.")
-    parser.add_argument("--config", type=str, default="config.yaml", help="Path to the YAML config file.")
+    parser.add_argument("--config", type=str, default="configs/training-config.yaml", help="Path to the YAML config file.")
    args = parser.parse_args()

    # Load configuration
@@ -85,11 +87,12 @@

    # Load and process the dataset
    dataset_name = config["model"]["dataset_name"]
-    dataset = load_dataset("json", data_files=dataset_name, split="train")
+    extension = dataset_name.split(".")[-1]
+    dataset = load_dataset(extension, data_files=dataset_name, split="train")
    dataset = dataset.shuffle(seed=42)
-    dataset = dataset.select(range(50))  # Optional: select first 50 rows for demo
-    dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=list(dataset.features))
-
+    if config["demo"]:
+        dataset = dataset.select(range(50))  # Optional: select first 50 rows for demo
+    dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"])
    # Set up training arguments
    training_arguments = TrainingArguments(
        output_dir=config["model"]["output_dir"],
@@ -98,7 +101,7 @@
        optim=config["training"]["optim"],
        save_steps=config["training"]["save_steps"],
        logging_steps=config["training"]["logging_steps"],
-        learning_rate=config["training"]["learning_rate"],
+        learning_rate=float(config["training"]["learning_rate"]),
        fp16=config["training"]["fp16"],
        bf16=config["training"]["bf16"],
        max_grad_norm=config["training"]["max_grad_norm"],
@@ -113,11 +116,8 @@
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
-        dataset_text_field="text",
-        max_seq_length=config["training"]["max_seq_length"],
        tokenizer=tokenizer,
        args=training_arguments,
-        packing=config["training"]["packing"],
    )

    trainer.train()
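The three keyword arguments dropped from the SFTTrainer call reflect an API shift in newer trl, where these options live on a config object instead. With the trl floor pinned in pyproject.toml (>=0.15.2), a sketch of how those settings could be kept would look roughly like this; the SFTConfig field names follow recent trl and should be verified against the installed version:

```python
# Sketch: keeping the dropped options via SFTConfig (newer trl style).
# `config`, `model`, `dataset`, `peft_config`, and `tokenizer` are the
# objects already built in main() above.
from trl import SFTConfig, SFTTrainer

training_arguments = SFTConfig(
    output_dir=config["model"]["output_dir"],
    num_train_epochs=config["training"]["num_train_epochs"],
    learning_rate=float(config["training"]["learning_rate"]),
    dataset_text_field="text",                            # formerly an SFTTrainer kwarg
    max_seq_length=config["training"]["max_seq_length"],  # formerly an SFTTrainer kwarg
    packing=config["training"]["packing"],                # formerly an SFTTrainer kwarg
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    processing_class=tokenizer,  # newer trl prefers processing_class over tokenizer
    args=training_arguments,
)
trainer.train()
```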