Skip to content
This repository was archived by the owner on Sep 18, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions examples/quick_start/spm_detokenizer_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

from fairseq.data.encoders import build_bpe

from simuleval.utils import entrypoint
from simuleval.agents import TextToTextAgent
from simuleval.agents.actions import ReadAction, WriteAction
from simuleval.agents.pipeline import AgentPipeline
from simuleval.agents.states import AgentStates


@entrypoint
class DummySegmentAgent(TextToTextAgent):
"""
This agent just splits on space
Expand Down Expand Up @@ -70,13 +72,20 @@ def policy(self, states: AgentStates):
possible_full_words = self.spm_processor.decode(
" ".join([x for x in states.source])
)

# issue is when the starting word is in the previous segment
if self.detokenize_only and len(states.source) > 0:
start_word = "▁"
source_text = states.source[0]
states.source = []
if len(possible_full_words) == 0 and not states.source_finished:
return ReadAction()
else:
return WriteAction(possible_full_words, states.source_finished)
incomplete_word = False
if start_word not in source_text[0]:
incomplete_word = True
return WriteAction(
possible_full_words, states.source_finished, incomplete_word
)

if states.source_finished:
return WriteAction(possible_full_words, True)
Expand Down
2 changes: 1 addition & 1 deletion examples/quick_start/spm_source.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
▁Let ' s ▁do ▁it ▁with out ▁hesitation .
▁Let ' s ▁do ▁it ▁with out ▁hesitation .
2 changes: 1 addition & 1 deletion examples/quick_start/spm_target.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Let's do it without hesitation.
Let's do it without hesitation.
Binary file added examples/quick_start/tokenizer.model
Binary file not shown.
3 changes: 2 additions & 1 deletion simuleval/agents/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from typing import Union, List
from typing import Optional, Union, List
from dataclasses import dataclass
from simuleval.data.segments import Segment

Expand Down Expand Up @@ -52,6 +52,7 @@ class WriteAction(Action):

content: Union[str, List[float], Segment]
finished: bool
incomplete_word: Optional[bool] = None

def is_read(self) -> bool:
return False
Expand Down
4 changes: 4 additions & 0 deletions simuleval/agents/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,10 @@ def pop(self, states: Optional[AgentStates] = None) -> Segment:
segment = SEGMENT_TYPE_DICT[self.target_type](
index=0, content=action.content, finished=action.finished
)

if isinstance(segment, TextSegment) and action.incomplete_word is not None:
segment.incomplete_word = action.incomplete_word

states.update_target(segment)
return segment

Expand Down
1 change: 1 addition & 0 deletions simuleval/data/segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class EmptySegment(Segment):
class TextSegment(Segment):
content: str = ""
data_type: str = "text"
incomplete_word: bool = None


@dataclass
Expand Down
10 changes: 10 additions & 0 deletions simuleval/evaluator/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,16 @@ def receive_prediction(self, prediction: TextSegment):
else:
raise NotImplementedError

if prediction.incomplete_word:
first_half = self.prediction_list[-1]
second_half = prediction_list[0]
complete_word = first_half + second_half
self.prediction_list.pop()
self.delays.pop()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we also pop the last entry from `self.elapsed`? It is similar to `self.delays`, except that it also includes the actual inference time.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup! I have made those changes now

self.elapsed.pop()
prediction_list.pop(0)
prediction_list.insert(0, complete_word)

self.prediction_list += prediction_list

self.elapsed += [self.step_to_elapsed(self.step, current_time)] * len(
Expand Down