Changes from all commits (33 commits)
- 397f7d9 add: uv pyproject.toml (popiemon, Mar 5, 2025)
- b9884e3 add: src (popiemon, Mar 5, 2025)
- 27bb485 add: Dockerfile (popiemon, Mar 5, 2025)
- 5965d75 add: devcontainer.json (popiemon, Mar 5, 2025)
- 83399c9 fix: toml (popiemon, Mar 5, 2025)
- 76f8caa feat: ignore uv.lock (popiemon, Mar 5, 2025)
- 0c0ed76 add: .env.sample (popiemon, Mar 5, 2025)
- d8d703f fix: delete src (popiemon, Mar 5, 2025)
- 4b3442b add: update dependencies to include huggingface-hub (popiemon, Mar 5, 2025)
- d8aa952 feat: ignore results (popiemon, Mar 5, 2025)
- e5f0946 feat: pft success (popiemon, Mar 5, 2025)
- 643d9dd add: dev group (popiemon, Mar 5, 2025)
- 645a25e docs: Docker (popiemon, Mar 5, 2025)
- 6de1bd6 feat: remove_columns (popiemon, Mar 5, 2025)
- 26c8119 feat: ignore results (popiemon, Mar 5, 2025)
- 14287e4 del: .env (popiemon, Mar 5, 2025)
- 3f61dbe fix: model_name (popiemon, Mar 5, 2025)
- a41e4c7 docs: uv sync (popiemon, Mar 5, 2025)
- 8ba3fdd fix: logging (popiemon, Mar 5, 2025)
- 319fdbf Update Dockerfile (popiemon, Mar 5, 2025)
- d8d0bec Update training.py (popiemon, Mar 5, 2025)
- 856ac3e fix: devcontainer.json (popiemon, Mar 5, 2025)
- ca6f62e Merge branch 'feature/#15' of https://github.com/popiemon/semikong in… (popiemon, Mar 5, 2025)
- 3e837e4 fix: Dockerfile (popiemon, Mar 5, 2025)
- 8b8e1bb feat: package version (popiemon, Mar 5, 2025)
- be7ee11 revert training-config (popiemon, Mar 5, 2025)
- d030943 fix: dataset.map (popiemon, Mar 7, 2025)
- 5bfcc5f feat: ignore data (popiemon, Mar 7, 2025)
- be908e3 feat: training config (popiemon, Mar 7, 2025)
- 667d955 Update training.py (popiemon, Mar 7, 2025)
- 7b0db2f fix: training.py (popiemon, Mar 7, 2025)
- 9a90c03 fix: training.py (popiemon, Mar 7, 2025)
- f3c2dc8 Update training.py (popiemon, Mar 7, 2025)
24 changes: 24 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,24 @@
{
    "name": "semikong",
    "dockerComposeFile": "../compose.yaml",
    "service": "semikong",
    "workspaceFolder": "/workspace",
    "customizations": {
        "vscode": {
            "extensions": [
                "GitHub.copilot-chat",
                "codezombiech.gitignore",
                "GitHub.copilot",
                "saoudrizwan.claude-dev",
                "njpwerner.autodocstring",
                "ms-python.mypy-type-checker",
                "ms-python.vscode-pylance",
                "ms-python.python",
                "ms-python.debugpy",
                "ms-python.isort",
                "ms-python.black-formatter"
            ]
        }
    }
}

4 changes: 4 additions & 0 deletions .gitignore
@@ -160,3 +160,7 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
+
+*.lock
+results*
+data/*
1 change: 1 addition & 0 deletions .python-version
@@ -0,0 +1 @@
3.10
17 changes: 17 additions & 0 deletions Dockerfile
@@ -0,0 +1,17 @@
FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime

# The installer requires curl (and certificates) to download the release archive
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl=7.81.0-1ubuntu1.20 \
        ca-certificates=20240203~22.04.1 \
        git=1:2.34.1-1ubuntu1.12 \
        vim=2:8.2.3995-1ubuntu2.23 \
    && rm -rf /var/lib/apt/lists/*

# Download the latest installer
ADD https://astral.sh/uv/install.sh /uv-installer.sh

# Run the installer then remove it
RUN sh /uv-installer.sh && rm /uv-installer.sh

# Ensure the installed binary is on the `PATH`
ENV PATH="/opt/conda/bin:$PATH"
ENV UV_PROJECT_ENVIRONMENT="/opt/conda"
ENV UV_SYSTEM_PYTHON=true

WORKDIR /workspace
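A quick smoke test of this image might look like the following. This is a sketch only: it assumes the `semikong` compose service defined below and that the installer leaves `uv` on the `PATH`.

```sh
# Hypothetical smoke test; assumes the "semikong" service from compose.yaml
docker compose build semikong
docker compose run --rm semikong uv --version
```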
12 changes: 11 additions & 1 deletion README.md
@@ -323,7 +323,17 @@ You can perform inference with SEMIKONG chat or base models as below.
</p>

### Quick start - Docker
-TBA
+You should install Docker and VSCode.
+
+Install the Dev Containers extension.
+Then select "Reopen in Container."
+
+You should now be able to work with the SEMIKONG project inside the Docker container using VSCode.
+
+Run the following command to install all packages:
+```sh
+uv sync --all-extras
+```
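Once the sync finishes, training can be launched from the workspace root. A minimal sketch, assuming the default config path wired into training.py in this PR and a dataset already placed under data/:

```sh
# Hypothetical invocation; --config defaults to configs/training-config.yaml
uv run python training.py --config configs/training-config.yaml
```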

<p align="right"> [
<a href="#top">Back to top ⬆️ </a> ]
18 changes: 18 additions & 0 deletions compose.yaml
@@ -0,0 +1,18 @@
services:
  semikong:
    build:
      context: .
      dockerfile: Dockerfile
    image: semikong
    container_name: semikong
    tty: true
    stdin_open: true
    volumes:
      - ./:/workspace
    working_dir: /workspace
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
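To confirm the GPU reservation actually reaches the container, one quick check might be the following (a sketch, assuming the NVIDIA Container Toolkit is installed on the host):

```sh
# Hypothetical GPU visibility check inside the service container
docker compose run --rm semikong nvidia-smi
```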
5 changes: 4 additions & 1 deletion configs/training-config.yaml
@@ -1,3 +1,5 @@
+demo: True
+
model:
  model_name: "model_path_folder_or_model_name_hf"
  new_model: "semikong-8b"
@@ -7,7 +9,8 @@ model:
  bnb_4bit_quant_type: "nf4"
  device_map: {"": 0}
  output_dir: "./results"
-  dataset_name: "dataset_path_folder_or_dataset_name_hf"
+  dataset_name: /workspace/data/SemiKong_Training_Dataset/train-00000-of-00001.parquet
+  remove_columns: ["input"]

training:
  num_train_epochs: 2
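For orientation, the configuration consumed by training.py now has roughly the shape below. This is a sketch assembled only from keys visible in this PR; values are placeholders, and keys not shown in the diffs are omitted rather than guessed.

```yaml
# Hypothetical consolidated view of configs/training-config.yaml (keys seen in this PR only)
demo: True                # training.py trims the dataset to 50 rows when true

model:
  model_name: "model_path_folder_or_model_name_hf"
  new_model: "semikong-8b"
  bnb_4bit_quant_type: "nf4"
  device_map: {"": 0}
  output_dir: "./results"
  dataset_name: /workspace/data/SemiKong_Training_Dataset/train-00000-of-00001.parquet
  remove_columns: ["input"]

training:
  num_train_epochs: 2
  learning_rate: 2e-4     # placeholder value; training.py wraps this in float()
```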
30 changes: 30 additions & 0 deletions pyproject.toml
@@ -0,0 +1,30 @@
[project]
name = "semikong"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
authors = [
    { name = "", email = "" }
]
requires-python = ">=3.10"
dependencies = [
    "bitsandbytes>=0.45.3",
    "datasets>=3.3.2",
    "huggingface-hub>=0.29.1",
    "loguru>=0.7.3",
    "numpy>=2.2.3",
    "peft>=0.14.0",
    "transformers>=4.49.0",
    "trl>=0.15.2",
    "vllm>=0.5.0.post1",
]

[dependency-groups]
dev = [
    "black>=25.1.0",
    "isort>=6.0.1",
    "pre-commit>=4.1.0",
    "pylint>=3.3.4",
    "tqdm>=4.67.1",
]
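One note on the dev group: with uv, the `dev` dependency group is installed by default on a plain `uv sync`, while the README's `--all-extras` flag targets `[project.optional-dependencies]`, which this file does not define. A quick way to see the distinction:

```sh
# The dev dependency group installs by default with uv
uv sync            # project dependencies + dev group
uv sync --no-dev   # project dependencies only
```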

7 changes: 4 additions & 3 deletions raw_inference.py
@@ -1,10 +1,11 @@
import argparse
-import logging

import torch
+import transformers
import yaml
from peft import LoraConfig, PeftModel
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          BitsAndBytesConfig, pipeline)


def load_config(config_file="./configs/inference-config.yaml"):
@@ -70,7 +71,7 @@ def text_gen_eval_wrapper(model, tokenizer, prompt, max_length=200, temperature=
    A wrapper function for inferencing, generating text based on a prompt.
    """
    # Suppress logging
-    logging.set_verbosity(logging.CRITICAL)
+    transformers.logging.set_verbosity(transformers.logging.CRITICAL)

    # Initialize text generation pipeline
    pipe = pipeline(
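The verbosity fix above works because the standard library logging module has no set_verbosity; that helper lives in transformers' own logging facade. A minimal illustration of the API being targeted:

```python
import transformers

# transformers ships its own logging helpers, separate from stdlib logging
transformers.logging.set_verbosity(transformers.logging.CRITICAL)
print(transformers.logging.get_verbosity())  # 50, i.e. CRITICAL
```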
20 changes: 10 additions & 10 deletions training.py
@@ -4,7 +4,8 @@
import yaml
from datasets import load_dataset
from peft import LoraConfig
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          BitsAndBytesConfig, TrainingArguments)
from trl import SFTTrainer


@@ -51,6 +52,7 @@ def configure_lora(config):
        r=lora_params["lora_r"],
        bias="none",
        task_type="CAUSAL_LM",
+        target_modules=["q_proj", "v_proj"],
    )


@@ -71,7 +73,7 @@ def template_dataset(sample, tokenizer):
def main():
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Fine-tune a model with LoRA and 4-bit precision.")
-    parser.add_argument("--config", type=str, default="config.yaml", help="Path to the YAML config file.")
+    parser.add_argument("--config", type=str, default="configs/training-config.yaml", help="Path to the YAML config file.")
    args = parser.parse_args()

    # Load configuration
@@ -85,11 +87,12 @@

    # Load and process the dataset
    dataset_name = config["model"]["dataset_name"]
-    dataset = load_dataset("json", data_files=dataset_name, split="train")
+    extension = dataset_name.split(".")[-1]
+    dataset = load_dataset(extension, data_files=dataset_name, split="train")
    dataset = dataset.shuffle(seed=42)
-    dataset = dataset.select(range(50))  # Optional: select first 50 rows for demo
-    dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=list(dataset.features))
-
+    if config["demo"]:
+        dataset = dataset.select(range(50))  # Optional: select first 50 rows for demo
+    dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"])
    # Set up training arguments
    training_arguments = TrainingArguments(
        output_dir=config["model"]["output_dir"],
@@ -98,7 +101,7 @@
        optim=config["training"]["optim"],
        save_steps=config["training"]["save_steps"],
        logging_steps=config["training"]["logging_steps"],
-        learning_rate=config["training"]["learning_rate"],
+        learning_rate=float(config["training"]["learning_rate"]),
        fp16=config["training"]["fp16"],
        bf16=config["training"]["bf16"],
        max_grad_norm=config["training"]["max_grad_norm"],
@@ -113,11 +116,8 @@
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
-        dataset_text_field="text",
-        max_seq_length=config["training"]["max_seq_length"],
        tokenizer=tokenizer,
        args=training_arguments,
-        packing=config["training"]["packing"],
    )

    trainer.train()
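The three keyword arguments dropped from the SFTTrainer call reflect an API shift in newer trl, where these options live on a config object instead. With the trl floor pinned in pyproject.toml (>=0.15.2), a sketch of how those settings could be kept would look roughly like this; the SFTConfig field names follow recent trl and should be verified against the installed version:

```python
# Sketch: keeping the dropped options via SFTConfig (newer trl style).
# `config`, `model`, `dataset`, `peft_config`, and `tokenizer` are the
# objects already built in main() above.
from trl import SFTConfig, SFTTrainer

training_arguments = SFTConfig(
    output_dir=config["model"]["output_dir"],
    num_train_epochs=config["training"]["num_train_epochs"],
    learning_rate=float(config["training"]["learning_rate"]),
    dataset_text_field="text",                            # formerly an SFTTrainer kwarg
    max_seq_length=config["training"]["max_seq_length"],  # formerly an SFTTrainer kwarg
    packing=config["training"]["packing"],                # formerly an SFTTrainer kwarg
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    processing_class=tokenizer,  # newer trl prefers processing_class over tokenizer
    args=training_arguments,
)
trainer.train()
```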