Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ python -m eagle.application.webui --ea-model-path [path of EAGLE weight]\
The *total-token* parameter is the number of draft tokens. For smaller models and more powerful GPUs, this value can be set larger. Tuning it for the specific device and model can achieve better results. If set to -1, EAGLE-2 will automatically configure this parameter.

### With Code
You can use our provided "eagenerate" to speed up generation, just like using 'generate' from Hugging Face. Here is an example.
You can use our provided "ea_generate" to speed up generation, just like using 'generate' from Hugging Face. Here is an example.
```python
from eagle.model.ea_model import EaModel
from fastchat.model import get_conversation_template
Expand All @@ -197,7 +197,7 @@ conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids=model.tokenizer([prompt]).input_ids
input_ids = torch.as_tensor(input_ids).cuda()
output_ids=model.eagenerate(input_ids,temperature=0.5,max_new_tokens=512)
output_ids=model.ea_generate(input_ids,temperature=0.5,max_new_tokens=512)
output=model.tokenizer.decode(output_ids[0])
```

Expand Down
6 changes: 3 additions & 3 deletions eagle/application/webui.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def warmup(model):
prompt += " "
input_ids = model.tokenizer([prompt]).input_ids
input_ids = torch.as_tensor(input_ids).cuda()
for output_ids in model.ea_generate(input_ids):
for output_ids in model.ea_generate(input_ids, streaming=True):
ol=output_ids.shape[1]

def bot(history, temperature, top_p, use_EaInfer, highlight_EaInfer,session_state,):
Expand Down Expand Up @@ -154,7 +154,7 @@ def bot(history, temperature, top_p, use_EaInfer, highlight_EaInfer,session_stat
if use_EaInfer:

for output_ids in model.ea_generate(input_ids, temperature=temperature, top_p=top_p,
max_new_tokens=args.max_new_token,is_llama3=args.model_type=="llama-3-instruct"):
max_new_tokens=args.max_new_token,is_llama3=args.model_type=="llama-3-instruct",streaming=True):
totaltime+=(time.time()-start_time)
total_ids+=1
decode_ids = output_ids[0, input_len:].tolist()
Expand Down Expand Up @@ -183,7 +183,7 @@ def bot(history, temperature, top_p, use_EaInfer, highlight_EaInfer,session_stat

else:
for output_ids in model.naive_generate(input_ids, temperature=temperature, top_p=top_p,
max_new_tokens=args.max_new_token,is_llama3=args.model_type=="llama-3-instruct"):
max_new_tokens=args.max_new_token,is_llama3=args.model_type=="llama-3-instruct",streaming=True):
totaltime += (time.time() - start_time)
total_ids+=1
decode_ids = output_ids[0, input_len:].tolist()
Expand Down
4 changes: 2 additions & 2 deletions eagle/evaluation/gen_baseline_answer_llama3chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def get_model_answers(
torch.cuda.synchronize()
start_time = time.time()

output_ids, new_token, idx = model.naivegenerate(
output_ids, new_token, idx = model.naive_generate(
torch.as_tensor(input_ids).cuda(),
temperature=temperature,
log=True,
Expand Down Expand Up @@ -240,7 +240,7 @@ def get_model_answers(
torch.cuda.synchronize()
start_time = time.time()

output_ids, new_token, idx = model.naivegenerate(
output_ids, new_token, idx = model.naive_generate(
torch.as_tensor(input_ids).cuda(),
temperature=temperature,
log=True,
Expand Down
4 changes: 2 additions & 2 deletions eagle/evaluation/gen_ea_answer_ds.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def get_model_answers(
torch.cuda.synchronize()
start_time = time.time()

output_ids, new_token, idx = model.eagenerate(
output_ids, new_token, idx = model.ea_generate(
torch.as_tensor(input_ids).cuda(),
temperature=temperature,
log=True,
Expand Down Expand Up @@ -234,7 +234,7 @@ def get_model_answers(
torch.cuda.synchronize()
start_time = time.time()

output_ids, new_token, idx = model.eagenerate(
output_ids, new_token, idx = model.ea_generate(
torch.as_tensor(input_ids).cuda(),
temperature=temperature,
log=True,
Expand Down
4 changes: 2 additions & 2 deletions eagle/evaluation/gen_ea_answer_llama2chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def get_model_answers(
torch.cuda.synchronize()
start_time = time.time()

output_ids, new_token, idx = model.eagenerate(
output_ids, new_token, idx = model.ea_generate(
torch.as_tensor(input_ids).cuda(),
temperature=temperature,
log=True
Expand Down Expand Up @@ -210,7 +210,7 @@ def get_model_answers(

torch.cuda.synchronize()
start_time = time.time()
output_ids, new_token, idx = model.eagenerate(
output_ids, new_token, idx = model.ea_generate(
torch.as_tensor(input_ids).cuda(),
temperature=temperature,
log=True
Expand Down
4 changes: 2 additions & 2 deletions eagle/evaluation/gen_ea_answer_llama3chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def get_model_answers(
torch.cuda.synchronize()
start_time = time.time()

output_ids, new_token, idx = model.eagenerate(
output_ids, new_token, idx = model.ea_generate(
torch.as_tensor(input_ids).cuda(),
temperature=temperature,
log=True,
Expand Down Expand Up @@ -241,7 +241,7 @@ def get_model_answers(
torch.cuda.synchronize()
start_time = time.time()

output_ids, new_token, idx = model.eagenerate(
output_ids, new_token, idx = model.ea_generate(
torch.as_tensor(input_ids).cuda(),
temperature=temperature,
log=True,
Expand Down
4 changes: 2 additions & 2 deletions eagle/evaluation/gen_ea_answer_mix.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def get_model_answers(
torch.cuda.synchronize()
start_time = time.time()

output_ids, new_token, idx = model.eagenerate(
output_ids, new_token, idx = model.ea_generate(
torch.as_tensor(input_ids).cuda(),
temperature=temperature,
log=True
Expand Down Expand Up @@ -211,7 +211,7 @@ def get_model_answers(
try:
torch.cuda.synchronize()
start_time = time.time()
output_ids, new_token, idx = model.eagenerate(
output_ids, new_token, idx = model.ea_generate(
torch.as_tensor(input_ids).cuda(),
temperature=temperature,
log=True
Expand Down
4 changes: 2 additions & 2 deletions eagle/evaluation/gen_ea_answer_vicuna.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def get_model_answers(
torch.cuda.synchronize()
start_time = time.time()

output_ids, new_token, idx = model.eagenerate(
output_ids, new_token, idx = model.ea_generate(
torch.as_tensor(input_ids).cuda(),
temperature=temperature,
log=True
Expand Down Expand Up @@ -212,7 +212,7 @@ def get_model_answers(

torch.cuda.synchronize()
start_time = time.time()
output_ids, new_token, idx = model.eagenerate(
output_ids, new_token, idx = model.ea_generate(
torch.as_tensor(input_ids).cuda(),
temperature=temperature,
log=True
Expand Down
Loading