-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsaving8bit.py
More file actions
33 lines (29 loc) · 1.14 KB
/
saving8bit.py
File metadata and controls
33 lines (29 loc) · 1.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Chat-tuned 1.1B-parameter TinyLlama checkpoint from the Hugging Face Hub.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Quantize weights to 8-bit at load time via bitsandbytes — reduces the
# memory footprint roughly 2x compared to fp16 loading.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

print("Loading 8-bit model...")
# Tokenizer is loaded unquantized; only the model weights use the 8-bit config.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
def chat(user_input):
    """Generate one assistant reply for *user_input*.

    Wraps the input in a minimal "User: ...\\nAssistant:" prompt, runs
    sampled generation on the globally loaded 8-bit model, and returns
    the full decoded text (prompt included, special tokens stripped).

    Args:
        user_input: Raw text typed by the user.

    Returns:
        The decoded generation as a single string.
    """
    prompt = f"User: {user_input}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        # max_new_tokens bounds only the reply. The original max_length=150
        # counted prompt tokens too, so long prompts silently shrank the answer.
        max_new_tokens=150,
        # temperature/sampling params are ignored (with a warning) unless
        # sampling is explicitly enabled.
        do_sample=True,
        temperature=0.7,
        repetition_penalty=1.2,
        # Open-ended generation: pad with EOS to silence the missing-pad warning.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
if __name__ == "__main__":
    # Simple interactive REPL: read a line, generate a reply, repeat.
    print("\nType 'exit' to quit.\n")
    while True:
        # Keep the try body minimal — only input() is expected to raise here.
        # Ctrl-D (EOFError) and Ctrl-C (KeyboardInterrupt) both exit cleanly
        # instead of dumping a traceback.
        try:
            user_input = input("\nYou: ")
        except (EOFError, KeyboardInterrupt):
            print("\nInput stream closed. Exiting...")
            break
        if user_input.lower() in ["exit", "quit"]:
            break
        response = chat(user_input)
        print("Model:", response)