import re

from llmlingua import PromptCompressor
# Instantiate the compressor from the LLMLingua-2 MeetingBank checkpoint;
# device_map is pinned to "cpu".
llm_lingua = PromptCompressor(
    model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
    use_llmlingua2=True,
    device_map="cpu",
)
# Shared opening-tag grammar: "<llmlingua" followed by optional
# ", rate=<number>" and ", compress=<True|False>" attributes. Each attribute
# appears twice (rate, compress, rate, compress), which lets either order
# match and is why every match tuple carries four attribute slots ahead of the
# captured text.
_RATE_ATTR = r"(?:,\s*rate\s*=\s*([\d\.]+))?"
_COMPRESS_ATTR = r"(?:,\s*compress\s*=\s*(True|False))?"
_OPEN_TAG = (
    r"<llmlingua\s*"
    + r"\s*".join([_RATE_ATTR, _COMPRESS_ATTR, _RATE_ATTR, _COMPRESS_ATTR])
    + r"\s*>"
)


def show_matches(pattern: str, prompt: str) -> list:
    """Print and return every ``re.findall`` match of *pattern* in *prompt*.

    Because the patterns below contain capture groups, each match is a tuple
    of the group values (empty string for a group that did not participate).
    """
    found = re.findall(pattern, prompt)
    print(found)
    return found


structured_prompt_repo_example = """<llmlingua, compress=False>Speaker 4:</llmlingua><llmlingua, rate=0.4> Thank you. And can we do the functions for content? </llmlingua><llmlingua, compress=False>
Speaker 0:</llmlingua><llmlingua, rate=0.4> Item 11 is a communication from Council on Price recommendation to increase appropriation in the general fund group </llmlingua><llmlingua, compress=False>
Speaker 4:</llmlingua><llmlingua, rate=0.6> We have a promotion and a second time as councilman served Councilman Ringa and customers and they have any comments.</llmlingua>"""

# Current pattern: the content group `[^<]+` stops at the first "<", so a
# segment whose text contains any other tag can never reach "</llmlingua>".
original_pattern = _OPEN_TAG + r"([^<]+)</llmlingua>"
matches = show_matches(original_pattern, structured_prompt_repo_example)
# output: [('', 'False', '', '', 'Speaker 4:'), ('0.4', '', '', '', ' Thank you. And can we do the functions for content? '),
# ('', 'False', '', '', '\nSpeaker 0:'), ('0.4', '', '', '', ' Item 11 is a communication from Council on Price recommendation to increase appropriation in the general fund group '),
# ('', 'False', '', '', '\nSpeaker 4:'), ('0.6', '', '', '', ' We have a promotion and a second time as councilman served Councilman Ringa and customers and they have any comments.')]

structured_prompt_with_nested_tags = """<llmlingua, compress=False>Speaker 4:</llmlingua><llmlingua, rate=0.4> Thank you. And can we do the functions for content? </llmlingua><llmlingua, compress=False>
Speaker 0:</llmlingua><llmlingua, rate=0.4> Item 11 is a communication from Council on Price recommendation to increase appropriation in the general fund group </llmlingua><llmlingua, compress=False>
Speaker 4:</llmlingua><llmlingua, rate=0.6> We have a promotion <tag> and a second time as councilman served Councilman Ringa </tag> and customers and they have any comments.</llmlingua>"""

# Same pattern on a prompt with an inner <tag>...</tag>: the last segment is
# silently dropped from the result.
matches = show_matches(original_pattern, structured_prompt_with_nested_tags)
# output: [('', 'False', '', '', 'Speaker 4:'), ('0.4', '', '', '', ' Thank you. And can we do the functions for content? '),
# ('', 'False', '', '', '\nSpeaker 0:'), ('0.4', '', '', '', ' Item 11 is a communication from Council on Price recommendation to increase appropriation in the general fund group '),
# ('', 'False', '', '', '\nSpeaker 4:')]

# Proposed pattern: capture lazily up to the first "</llmlingua>", so inner
# markup (and even a bare "<") is kept as plain text. `[\s\S]` is used instead
# of `.` so newlines match without needing re.DOTALL at the call site.
# A single lazy quantifier is preferred over a tag-aware nested form such as
# `(?:[^<]*(?:<(?!/llmlingua>)[^>]*>)?)*?`: that shape backtracks
# exponentially when a closing tag is missing, and it fails on a "<" that is
# not followed by ">" before the closing tag.
new_pattern = _OPEN_TAG + r"([\s\S]*?)</llmlingua>"
matches = show_matches(new_pattern, structured_prompt_with_nested_tags)
# output: [('', 'False', '', '', 'Speaker 4:'), ('0.4', '', '', '', ' Thank you. And can we do the functions for content? '),
# ('', 'False', '', '', '\nSpeaker 0:'), ('0.4', '', '', '', ' Item 11 is a communication from Council on Price recommendation to increase appropriation in the general fund group '),
# ('', 'False', '', '', '\nSpeaker 4:'), ('0.6', '', '', '', ' We have a promotion <tag> and a second time as councilman served Councilman Ringa </tag> and customers and they have any comments.')]
The regular expression should handle nested tags within the prompt without disrupting the matching process, treating them as plain text content.
Describe the bug
The original regular expression, which captures the tag content with
`([^<]+)</llmlingua>`, fails when the text inside llmlingua tags contains other tags (like `<tag>...</tag>`). I suggest replacing it with something like
`((?:[^<]*(?:<(?!/llmlingua>)[^>]*>)?)*?)</llmlingua>`. Commit on my fork: 73baf3f
Steps to reproduce
Comparing pattern matching:
Comparing "compressions" with and without nested non-llmlingua-related tags:
Expected Behavior
The regular expression should handle nested tags within the prompt without disrupting the matching process, treating them as plain text content.
Logs
No response
Additional Information
No response