SafeAILab · tonylt · Jul 8, 2025
diff --git a/README.md b/README.md
@@ -177,7 +177,7 @@ python -m eagle.application.webui --ea-model-path [path of EAGLE weight]\
 The *total-token* is the number of draft tokens. For smaller models and advanced GPUs, this value can be set larger. Adjusting according to the specific device and model can achieve better results. If set to -1, EAGLE-2 will automatically configure this parameter.
 
 ### With Code
-You can use our provided "eagenerate" for speedup generation just like using 'generate' from Hugging Face. Here is an example.
+You can use our provided "ea_generate" to speed up generation, just as you would use "generate" from Hugging Face. Here is an example.
 ```python
 from eagle.model.ea_model import EaModel
 from fastchat.model import get_conversation_template
@@ -197,7 +197,7 @@ conv.append_message(conv.roles[1], None)
 prompt = conv.get_prompt()
 input_ids=model.tokenizer([prompt]).input_ids
 input_ids = torch.as_tensor(input_ids).cuda()
-output_ids=model.eagenerate(input_ids,temperature=0.5,max_new_tokens=512)
+output_ids=model.ea_generate(input_ids,temperature=0.5,max_new_tokens=512)
 output=model.tokenizer.decode(output_ids[0])
 ```
 

diff --git a/eagle/application/webui.py b/eagle/application/webui.py
@@ -91,7 +91,7 @@ def warmup(model):
         prompt += " "
     input_ids = model.tokenizer([prompt]).input_ids
     input_ids = torch.as_tensor(input_ids).cuda()
-    for output_ids in model.ea_generate(input_ids):
+    for output_ids in model.ea_generate(input_ids, streaming=True):
         ol=output_ids.shape[1]
 
 def bot(history, temperature, top_p, use_EaInfer, highlight_EaInfer,session_state,):
@@ -154,7 +154,7 @@ def bot(history, temperature, top_p, use_EaInfer, highlight_EaInfer,session_stat
     if use_EaInfer:
 
         for output_ids in model.ea_generate(input_ids, temperature=temperature, top_p=top_p,
-                                            max_new_tokens=args.max_new_token,is_llama3=args.model_type=="llama-3-instruct"):
+                                            max_new_tokens=args.max_new_token,is_llama3=args.model_type=="llama-3-instruct",streaming=True):
             totaltime+=(time.time()-start_time)
             total_ids+=1
             decode_ids = output_ids[0, input_len:].tolist()
@@ -183,7 +183,7 @@ def bot(history, temperature, top_p, use_EaInfer, highlight_EaInfer,session_stat
 
     else:
         for output_ids in model.naive_generate(input_ids, temperature=temperature, top_p=top_p,
-                                            max_new_tokens=args.max_new_token,is_llama3=args.model_type=="llama-3-instruct"):
+                                            max_new_tokens=args.max_new_token,is_llama3=args.model_type=="llama-3-instruct",streaming=True):
             totaltime += (time.time() - start_time)
             total_ids+=1
             decode_ids = output_ids[0, input_len:].tolist()

diff --git a/eagle/evaluation/gen_baseline_answer_llama3chat.py b/eagle/evaluation/gen_baseline_answer_llama3chat.py
@@ -158,7 +158,7 @@ def get_model_answers(
             torch.cuda.synchronize()
             start_time = time.time()
 
-            output_ids, new_token, idx = model.naivegenerate(
+            output_ids, new_token, idx = model.naive_generate(
                 torch.as_tensor(input_ids).cuda(),
                 temperature=temperature,
                 log=True,
@@ -240,7 +240,7 @@ def get_model_answers(
                 torch.cuda.synchronize()
                 start_time = time.time()
 
-                output_ids, new_token, idx = model.naivegenerate(
+                output_ids, new_token, idx = model.naive_generate(
                     torch.as_tensor(input_ids).cuda(),
                     temperature=temperature,
                     log=True,

diff --git a/eagle/evaluation/gen_ea_answer_ds.py b/eagle/evaluation/gen_ea_answer_ds.py
@@ -155,7 +155,7 @@ def get_model_answers(
             torch.cuda.synchronize()
             start_time = time.time()
 
-            output_ids, new_token, idx = model.eagenerate(
+            output_ids, new_token, idx = model.ea_generate(
                 torch.as_tensor(input_ids).cuda(),
                 temperature=temperature,
                 log=True,
@@ -234,7 +234,7 @@ def get_model_answers(
                 torch.cuda.synchronize()
                 start_time = time.time()
 
-                output_ids, new_token, idx = model.eagenerate(
+                output_ids, new_token, idx = model.ea_generate(
                     torch.as_tensor(input_ids).cuda(),
                     temperature=temperature,
                     log=True,

diff --git a/eagle/evaluation/gen_ea_answer_llama2chat.py b/eagle/evaluation/gen_ea_answer_llama2chat.py
@@ -144,7 +144,7 @@ def get_model_answers(
             torch.cuda.synchronize()
             start_time = time.time()
 
-            output_ids, new_token, idx = model.eagenerate(
+            output_ids, new_token, idx = model.ea_generate(
                 torch.as_tensor(input_ids).cuda(),
                 temperature=temperature,
                 log=True
@@ -210,7 +210,7 @@ def get_model_answers(
 
                 torch.cuda.synchronize()
                 start_time = time.time()
-                output_ids, new_token, idx = model.eagenerate(
+                output_ids, new_token, idx = model.ea_generate(
                     torch.as_tensor(input_ids).cuda(),
                     temperature=temperature,
                     log=True

diff --git a/eagle/evaluation/gen_ea_answer_llama3chat.py b/eagle/evaluation/gen_ea_answer_llama3chat.py
@@ -159,7 +159,7 @@ def get_model_answers(
             torch.cuda.synchronize()
             start_time = time.time()
 
-            output_ids, new_token, idx = model.eagenerate(
+            output_ids, new_token, idx = model.ea_generate(
                 torch.as_tensor(input_ids).cuda(),
                 temperature=temperature,
                 log=True,
@@ -241,7 +241,7 @@ def get_model_answers(
                 torch.cuda.synchronize()
                 start_time = time.time()
 
-                output_ids, new_token, idx = model.eagenerate(
+                output_ids, new_token, idx = model.ea_generate(
                     torch.as_tensor(input_ids).cuda(),
                     temperature=temperature,
                     log=True,

diff --git a/eagle/evaluation/gen_ea_answer_mix.py b/eagle/evaluation/gen_ea_answer_mix.py
@@ -145,7 +145,7 @@ def get_model_answers(
             torch.cuda.synchronize()
             start_time = time.time()
 
-            output_ids, new_token, idx = model.eagenerate(
+            output_ids, new_token, idx = model.ea_generate(
                 torch.as_tensor(input_ids).cuda(),
                 temperature=temperature,
                 log=True
@@ -211,7 +211,7 @@ def get_model_answers(
                 try:
                     torch.cuda.synchronize()
                     start_time = time.time()
-                    output_ids, new_token, idx = model.eagenerate(
+                    output_ids, new_token, idx = model.ea_generate(
                         torch.as_tensor(input_ids).cuda(),
                         temperature=temperature,
                         log=True

diff --git a/eagle/evaluation/gen_ea_answer_vicuna.py b/eagle/evaluation/gen_ea_answer_vicuna.py
@@ -148,7 +148,7 @@ def get_model_answers(
             torch.cuda.synchronize()
             start_time = time.time()
 
-            output_ids, new_token, idx = model.eagenerate(
+            output_ids, new_token, idx = model.ea_generate(
                 torch.as_tensor(input_ids).cuda(),
                 temperature=temperature,
                 log=True
@@ -212,7 +212,7 @@ def get_model_answers(
 
                 torch.cuda.synchronize()
                 start_time = time.time()
-                output_ids, new_token, idx = model.eagenerate(
+                output_ids, new_token, idx = model.ea_generate(
                     torch.as_tensor(input_ids).cuda(),
                     temperature=temperature,
                     log=True