diff --git a/README.md b/README.md index b06b6e0..02ea939 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ from open_autonlu.methods.data_types import SaveFormat pipeline = TextClassificationTrainingPipeline( train_path="train.csv", test_path="test.csv", - config_overrides={"language": "en"} # "en" or "ru" + config_overrides={"language": "en"} # for non-en/ru also set "model_name_or_path" ) result = pipeline.train() pipeline.save("./model", SaveFormat.ONNX) @@ -82,7 +82,7 @@ pipeline.save("./model", SaveFormat.ONNX) pipeline = TokenClassificationTrainingPipeline( train_path="train.json", test_path="test.json", - config_overrides={"language": "en"} # "en" or "ru" + config_overrides={"language": "en"} # for non-en/ru also set "model_name_or_path" ) result = pipeline.train() pipeline.save("./model", SaveFormat.ONNX) @@ -140,7 +140,7 @@ from open_autonlu.methods.data_types import OodMethod, SaveFormat pipeline = TextClassificationTrainingPipeline( train_path="train.csv", config_overrides={ - "language": "en", # Prompt language for LLM pipelines ("en" or "ru") + "language": "en", # for non-en/ru also set "model_name_or_path" "ood_method": OodMethod.LOGIT, # OOD detection method "batch_size": 32, # Batch size } @@ -175,7 +175,7 @@ config_overrides = { ### LLM Data Augmentation -Automatically augment underrepresented classes using LLM generation. The `language` parameter controls which prompts are sent to the LLM (`"en"` for English, `"ru"` for Russian). +Automatically augment underrepresented classes using LLM generation. The `language` parameter controls which prompts are sent to the LLM (`"en"` for English, `"ru"` for Russian). For other languages, English prompts are used with an instruction to generate text in the language of the provided examples. 
```python import os @@ -259,6 +259,40 @@ config_overrides = { } ``` +## Multilingual Support + +The pipeline has been tested on **English (en), Russian (ru), French (fr), Chinese (zh), Arabic (ar), and Hindi (hi)**. Correct tokenization and NER behavior is guaranteed for these languages. Other languages are also supported but have not been explicitly validated. + +### Model selection for non-default languages + +Default models are only available for English (`bert-base-uncased`) and Russian (`ai-forever/ruBert-base`). For any other language, you **must** set `model_name_or_path` in `config_overrides`: + +```python +pipeline = TextClassificationTrainingPipeline( + train_path="train.csv", + config_overrides={ + "language": "fr", + "model_name_or_path": "MODEL_NAME", + } +) +``` + +Replace `MODEL_NAME` with any Hugging Face checkpoint that supports your target language (for example, `camembert-base` for French). + +### AncSetFit template + +When the pipeline selects AncSetFit (2-5 examples per class), it prepends a `template` string to each `anc_label` to form anchor sentences. Default templates exist only for English and Russian. For other languages, a custom `template` **must** be provided; otherwise, the pipeline will raise an error. 
Even for English/Russian, setting a domain-specific template is recommended for best results: + +```python +config_overrides={ + "language": "fr", + "model_name_or_path": "camembert-base", + "AncSetFitMethod": { + "template": "User asks the bot to perform a request using the skill: ", # write in your target language + } +} +``` + ## Data Formats ### Text Classification (CSV) diff --git a/app.py b/app.py index 69e0db8..1dae154 100644 --- a/app.py +++ b/app.py @@ -763,13 +763,12 @@ def apply_dq_filter_ner(): or "labels" not in train_ds.column_names ): return - language = st.session_state.get("language", "en") records = [] for row in train_ds: text = row["text"] tokens = row["tokens"] labels = row["labels"] - spans = convert_bio_to_spans(text, tokens, labels, language=language) + spans = convert_bio_to_spans(text, tokens, labels) records.append({"text": text, "spans": spans}) elif indices and st.session_state.ner_train_data: records = [ diff --git a/open_autonlu/data/ner_data_provider.py b/open_autonlu/data/ner_data_provider.py index 9494934..9aa1353 100644 --- a/open_autonlu/data/ner_data_provider.py +++ b/open_autonlu/data/ner_data_provider.py @@ -14,36 +14,28 @@ def _load_data(self) -> DatasetDict: if path := getattr(self, f"{split_name}_path"): with open(path) as f: records = json.load(f) - data_set = [] - if "text" in records[0]: - if "spans" in records[0]: - for record in records: - tokens, bio_tags = convert_offsets_to_bio( - record["text"], - record["spans"], - language=self.language, - ) - data_set.append( - { - "text": record["text"], - "tokens": tokens, - "labels": bio_tags, - } - ) - else: - for record in records: - tokens, bio_tags = self.from_brackets(record["text"]) - data_set.append( - { - "text": " ".join(tokens), - "tokens": tokens, - "labels": bio_tags, - } - ) - - splits[split_name] = Dataset.from_list(data_set) + splits[split_name] = Dataset.from_list( + [self._parse_record(r) for r in records] + ) return DatasetDict(splits) + def 
_parse_record(self, record: dict) -> dict: + if "tokens" in record and "labels" in record: + tokens, bio_tags = record["tokens"], record["labels"] + text = record.get("text", " ".join(tokens)) + elif "text" in record and "spans" in record: + tokens, bio_tags = convert_offsets_to_bio(record["text"], record["spans"]) + text = record["text"] + elif "text" in record: + tokens, bio_tags = self.from_brackets(record["text"]) + text = " ".join(tokens) + else: + raise ValueError( + "Record must contain either 'tokens'+'labels', " + "'text'+'spans', or bracket format." + ) + return {"text": text, "tokens": tokens, "labels": bio_tags} + @staticmethod def from_brackets(text: str) -> List[str]: """ diff --git a/open_autonlu/data/utils.py b/open_autonlu/data/utils.py index 5fde60d..ba65c4f 100644 --- a/open_autonlu/data/utils.py +++ b/open_autonlu/data/utils.py @@ -1,17 +1,30 @@ +import re +import unicodedata from typing import NamedTuple -import spacy from datasets import Dataset -SUPPORTED_LANGUAGES = ("ru", "en") +_CJK_RANGES = ( + (0x4E00, 0x9FFF), + (0x3400, 0x4DBF), + (0x20000, 0x2A6DF), + (0x2A700, 0x2B73F), + (0x2B740, 0x2B81F), + (0x2B820, 0x2CEAF), + (0xF900, 0xFAFF), + (0x2F800, 0x2FA1F), +) -_spacy_models: dict[str, spacy.language.Language] = {} +_TOKEN_RE = re.compile(r"\w+(?:-\w+)*|\S", re.UNICODE) -def _get_spacy_model(language: str) -> spacy.language.Language: - if language not in _spacy_models: - _spacy_models[language] = spacy.blank(language) - return _spacy_models[language] +def _is_cjk(char: str) -> bool: + cp = ord(char) + return any(lo <= cp <= hi for lo, hi in _CJK_RANGES) + + +def _is_mark(char: str) -> bool: + return unicodedata.category(char)[0] == "M" class Token(NamedTuple): @@ -20,25 +33,59 @@ class Token(NamedTuple): text: str -def tokenize_with_offsets(text: str, language: str = "en") -> list[Token]: - """Tokenize text and return list of (start, stop, text) for each token.""" - if language not in SUPPORTED_LANGUAGES: - raise ValueError( - 
f"Unsupported language: '{language}'. Supported: {SUPPORTED_LANGUAGES}" - ) - - nlp = _get_spacy_model(language) - doc = nlp(text) - return [ - Token( - start=token.idx if token.idx is not None else 0, - stop=token.idx + len(token.text) - if token.idx is not None - else len(token.text), - text=token.text, - ) - for token in doc - ] +def tokenize_with_offsets(text: str) -> list[Token]: + """Tokenize text into words/punctuation and return character offsets. + + Uses a Unicode-aware regex. CJK characters are split individually since they have no whitespace word boundaries. + Combining marks are merged back into the preceding token. + """ + raw: list[Token] = [] + for m in _TOKEN_RE.finditer(text): + word = m.group() + offset = m.start() + if len(word) > 1 and any(_is_cjk(c) for c in word): + buf_start: int | None = None + for i, ch in enumerate(word): + if _is_cjk(ch): + if buf_start is not None: + raw.append( + Token( + start=offset + buf_start, + stop=offset + i, + text=word[buf_start:i], + ) + ) + buf_start = None + raw.append(Token(start=offset + i, stop=offset + i + 1, text=ch)) + else: + if buf_start is None: + buf_start = i + if buf_start is not None: + raw.append( + Token( + start=offset + buf_start, + stop=offset + len(word), + text=word[buf_start:], + ) + ) + else: + raw.append(Token(start=offset, stop=m.end(), text=word)) + + if not raw: + return raw + + merged: list[Token] = [raw[0]] + for tok in raw[1:]: + prev = merged[-1] + if tok.start == prev.stop and ( + _is_mark(tok.text[0]) or _is_mark(prev.text[-1]) + ): + merged[-1] = Token( + start=prev.start, stop=tok.stop, text=text[prev.start : tok.stop] + ) + else: + merged.append(tok) + return merged def align_labels_with_tokens(labels, word_ids, b_to_i_label, label_all_tokens=True): @@ -69,9 +116,8 @@ def convert_offsets_to_bio( text: str, labels: list, label_key: str = "label", - language: str = "en", ) -> tuple[list[str], list[str]]: - substrings = tokenize_with_offsets(text, language=language) + 
substrings = tokenize_with_offsets(text) tokens: list[str] = [] bio_tags: list[str] = [] diff --git a/open_autonlu/llm_pipelines/prompts.py b/open_autonlu/llm_pipelines/prompts.py index b715bf1..71b01f3 100644 --- a/open_autonlu/llm_pipelines/prompts.py +++ b/open_autonlu/llm_pipelines/prompts.py @@ -1,7 +1,13 @@ +import logging from types import SimpleNamespace import outlines +log = logging.getLogger(__name__) + +SUPPORTED_PROMPT_LANGUAGES = frozenset({"en", "ru"}) +FALLBACK_PROMPT_LANGUAGE = "en" + # --------------------------------------------------------------------------- # English prompts # --------------------------------------------------------------------------- @@ -40,8 +46,15 @@ def generate_artificial_data(topic, texts, data_size): You should provide {{ data_size }} texts.""" +LANGUAGE_OF_EXAMPLES_INSTRUCTION = ( + "Generate all texts in the same language as the provided examples." +) + + @outlines.prompt -def generate_texts(topic, texts, data_size, domain_desc=None, label_desc=None): +def generate_texts( + topic, texts, data_size, domain_desc=None, label_desc=None, extra_instruction=None +): """### TASK Generate EXACTLY {{ data_size }} new unique texts that belong to the category "{{ topic }}", preserve the characteristic linguistic features of this category, but contain new, original content. {% if domain_desc %} @@ -67,6 +80,9 @@ def generate_texts(topic, texts, data_size, domain_desc=None, label_desc=None): 5. Preserve the stylistic features and emotional tone typical of the category 6. Preserve the punctuation and formatting features characteristic of the category (for example, if punctuation is absent in the examples, this indicates that the generated texts should also follow this punctuation pattern) 7. Generate EXACTLY {{ data_size }} texts — no more and no fewer + {% if extra_instruction %} + 8. 
{{ extra_instruction }} + {% endif %} IMPORTANT: Your goal is not to mechanically change individual words in examples, but to create new texts that could organically fit into the corpus of texts of this category, preserving their stylistic and structural features. @@ -255,11 +271,19 @@ def analyze_domain_ru(examples_by_label, label_names): # --------------------------------------------------------------------------- +def _generate_texts_with_language_instruction(*args, **kwargs): + kwargs.setdefault("extra_instruction", LANGUAGE_OF_EXAMPLES_INSTRUCTION) + return generate_texts(*args, **kwargs) + + def get_prompts(language: str = "en") -> SimpleNamespace: """Return a namespace of prompts for the given language. Args: - language: Language code ("en" or "ru"). + language: Language code. Only "en" and "ru" have dedicated prompts. + Any other language falls back to English prompts. + In fallback mode, the generation prompt includes an instruction to + generate texts in the same language as the provided examples. Returns: SimpleNamespace with attributes: @@ -268,6 +292,15 @@ def get_prompts(language: str = "en") -> SimpleNamespace: label_prefix, default_label_desc_template, domain_description_header, label_descriptions_header """ + requested_language = language + if language not in SUPPORTED_PROMPT_LANGUAGES: + log.warning( + "No prompts for language '%s'. 
Falling back to '%s' for data generation.", + language, + FALLBACK_PROMPT_LANGUAGE, + ) + language = FALLBACK_PROMPT_LANGUAGE + if language == "ru": return SimpleNamespace( default_system_prompt=DEFAULT_SYSTEM_PROMPT_RU, @@ -285,10 +318,15 @@ def get_prompts(language: str = "en") -> SimpleNamespace: label_descriptions_header="ОПИСАНИЯ МЕТОК:", ) + generate_texts_fn = ( + _generate_texts_with_language_instruction + if requested_language not in SUPPORTED_PROMPT_LANGUAGES + else generate_texts + ) return SimpleNamespace( default_system_prompt=DEFAULT_SYSTEM_PROMPT, analyzer_system_prompt=ANALYZER_SYSTEM_PROMPT, - generate_texts=generate_texts, + generate_texts=generate_texts_fn, analyze_domain=analyze_domain, generate_artificial_data=generate_artificial_data, label_prefix="LABEL", diff --git a/open_autonlu/methods/configs/anc_setfit_config.py b/open_autonlu/methods/configs/anc_setfit_config.py index 8bed6d0..74dd295 100644 --- a/open_autonlu/methods/configs/anc_setfit_config.py +++ b/open_autonlu/methods/configs/anc_setfit_config.py @@ -1,9 +1,12 @@ +import logging from dataclasses import dataclass from typing import Optional from ..constants import DEFAULT_TEMPLATE_BY_LANGUAGE from .setfit_config import SetFitMethodConfig +log = logging.getLogger(__name__) + @dataclass class AncSetFitConfig(SetFitMethodConfig): @@ -34,7 +37,8 @@ class AncSetFitConfig(SetFitMethodConfig): epochs: Number of training epochs. body_lr: Learning rate for the sentence transformer body. template: Prompt template prepended to anc_label to form anchors. - If None, resolved from language by DEFAULT_TEMPLATE_BY_LANGUAGE. + If None, resolved from language by DEFAULT_TEMPLATE_BY_LANGUAGE (ru/en). + For other languages a custom template must be provided. 
""" margin: float = 0.25 @@ -45,6 +49,20 @@ class AncSetFitConfig(SetFitMethodConfig): template: Optional[str] = None def __post_init__(self) -> None: - super().__post_init__() if self.template is None: + if self.language not in DEFAULT_TEMPLATE_BY_LANGUAGE: + raise ValueError( + f"Language '{self.language}' requires a custom 'template'. " + f"Default templates are only available for: " + f"{', '.join(DEFAULT_TEMPLATE_BY_LANGUAGE)}. " + f"Set 'template' in config_overrides when creating the pipeline." + ) self.template = DEFAULT_TEMPLATE_BY_LANGUAGE[self.language] + + super().__post_init__() + + if self.template in DEFAULT_TEMPLATE_BY_LANGUAGE.values(): + log.warning( + "Using the default AncSetFit template. " + "For best results, set a custom 'template' in config_overrides when creating the pipeline." + ) diff --git a/open_autonlu/methods/configs/base_config.py b/open_autonlu/methods/configs/base_config.py index 4a0e660..67acf83 100644 --- a/open_autonlu/methods/configs/base_config.py +++ b/open_autonlu/methods/configs/base_config.py @@ -12,8 +12,9 @@ class BaseMethodConfig: Attributes: model_name_or_path: HuggingFace model identifier or local path. If None, resolved from language by DEFAULT_MODEL_BY_LANGUAGE. - language: Language("en" or "ru"). Determines default model when - model_name_or_path is not set: en - bert-base-uncased, ru - ai-forever/ruBert-base. + language: Language code (e.g. "en", "ru"). If model_name_or_path is not set, + a default model is used only for "en" and "ru"; for other languages + you must set model_name_or_path. tokenizer_kwargs: Additional arguments passed to the tokenizer. max_seq_length: Maximum sequence length for tokenization. dev_fraction: Fraction of training data to hold out as a dev set @@ -28,6 +29,12 @@ class BaseMethodConfig: def __post_init__(self) -> None: if self.model_name_or_path is None: + if self.language not in DEFAULT_MODEL_BY_LANGUAGE: + raise ValueError( + f"Language '{self.language}' has no default model. 
" + f"Set 'model_name_or_path' in config_overrides to a checkpoint that supports this language " + f"Default models are only available for: {', '.join(DEFAULT_MODEL_BY_LANGUAGE)}." + ) self.model_name_or_path = DEFAULT_MODEL_BY_LANGUAGE[self.language] diff --git a/open_autonlu/methods/token_classification_finetuner.py b/open_autonlu/methods/token_classification_finetuner.py index 60619f4..d1abf56 100644 --- a/open_autonlu/methods/token_classification_finetuner.py +++ b/open_autonlu/methods/token_classification_finetuner.py @@ -5,6 +5,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import numpy as np +import torch from accelerate.utils import find_executable_batch_size from datasets import Dataset, DatasetDict from optimum.onnxruntime import ORTModelForTokenClassification @@ -350,10 +351,16 @@ def test(self, dataset: Dataset) -> ClassificationEvaluationResult: Returns a classification_report and a confusion_matrix """ true_labels = dataset["labels"] - predictions = self.predict(dataset["text"], output_format=NERFormat.BIO) - pred_labels = [] - for pred in predictions: - pred_labels.append(pred.labels) + pred_labels = find_executable_batch_size( + self._predict_from_tokens, starting_batch_size=BATCH_SIZE + )(dataset["tokens"]) + aligned_true, aligned_pred = [], [] + for t, p in zip(true_labels, pred_labels): + n = min(len(t), len(p)) + aligned_true.append(t[:n]) + aligned_pred.append(p[:n]) + true_labels = aligned_true + pred_labels = aligned_pred clf_report, confusion_matrix = evaluate_entity_level( true_labels, pred_labels, self.entity_names ) @@ -436,6 +443,51 @@ def predict( self._predict, starting_batch_size=batch_size )(texts, output_format=output_format) + def _predict_from_tokens( + self, batch_size: int, tokens_list: List[List[str]] + ) -> List[List[str]]: + """Run NER inference on pre-tokenized inputs (one BIO label per token). 
+ + Uses the same word-level tokenization as training (is_split_into_words=True), + so evaluation aligns with gold labels. + """ + + self.model.eval() + all_pred_labels: List[List[str]] = [] + for start in range(0, len(tokens_list), batch_size): + batch_tokens = tokens_list[start : start + batch_size] + enc = self.tokenizer( + batch_tokens, + is_split_into_words=True, + truncation=True, + padding=True, + return_tensors="pt", + ) + model_inputs = { + k: v.to(self.model.device) if hasattr(v, "to") else v + for k, v in enc.items() + } + with torch.no_grad(): + out = self.model(**model_inputs) + logits = out.logits + pred_ids = logits.argmax(dim=-1).cpu().tolist() + + for i in range(len(batch_tokens)): + try: + word_ids = enc.word_ids(batch_index=i) + except TypeError: + word_ids = enc.word_ids(i) + labels_i: List[str] = [] + last_wid: Optional[int] = None + for j, wid in enumerate(word_ids): + if wid is None: + continue + if wid != last_wid: + labels_i.append(self.id2label[pred_ids[i][j]]) + last_wid = wid + all_pred_labels.append(labels_i) + return all_pred_labels + def _predict( self, batch_size: int, diff --git a/pyproject.toml b/pyproject.toml index c401195..ae33f55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,6 @@ dependencies = [ "python-fire==0.1.0", "python-dotenv==1.1.0", "skl2onnx>=1.17.0", - "spacy>=3.0.0", "iterative-stratification==0.1.9", "onnxruntime==1.19.2 ; sys_platform == 'darwin' or (sys_platform == 'linux' and platform_machine != 'x86_64')", "onnxruntime-gpu==1.19.2 ; sys_platform == 'linux' and platform_machine == 'x86_64'", diff --git a/tests/data/ar_ner/test.json b/tests/data/ar_ner/test.json new file mode 100644 index 0000000..1106f9a --- /dev/null +++ b/tests/data/ar_ner/test.json @@ -0,0 +1,62 @@ +[ + { + "text": "صحيني [time : خمسة الفجر] [date : هذا الأسبوع]" + }, + { + "text": "بس نبي [color_type : اللون الوردي]" + }, + { + "text": "طفي الاضواء [house_place : بغرفة النوم] حقي" + }, + { + "text": "[house_place : ممر]" + }, 
+ { + "text": "كم الوقت في [place_name : السعودية]" + }, + { + "text": "قل لي أولي [time_zone : توقيت الرياض] زائد خمسة" + }, + { + "text": "كنسل منبه [time : سبعة الصبح]" + }, + { + "text": "تعجبني أغاني [artist_name : رابح]" + }, + { + "text": "قل لي تاريخ [date : اليوم]" + }, + { + "text": "كم التاريخ [date : اليوم]" + }, + { + "text": "عادي تطلب [food_type : sushi] عشان [meal_type : عشا] [timeofday : الليلة]" + }, + { + "text": "ما هي توقعات [time : الاسبوع]" + }, + { + "text": "ما هو الطقس [date : هذا الأسبوع]" + }, + { + "text": "قل لي الطقس [date : هذا الاسبوع]" + }, + { + "text": "اطلب [order_type : سفري]" + }, + { + "text": "[order_type : وجبات جاهزة]" + }, + { + "text": "يقدرون [order_type : يوصلون]" + }, + { + "text": "[order_type : يوصل]" + }, + { + "text": "[order_type : توصيل]" + }, + { + "text": "حط منبه يصحيني [time : سبعة الصبح] يوم [date : الخميس]" + } +] \ No newline at end of file diff --git a/tests/data/ar_ner/train.json b/tests/data/ar_ner/train.json new file mode 100644 index 0000000..feeab41 --- /dev/null +++ b/tests/data/ar_ner/train.json @@ -0,0 +1,302 @@ +[ + { + "text": "صحيني [time : تسعة الصباح] يوم [date : الجمعة]" + }, + { + "text": "حط منبه [time : بعد ساعتين من الحين]" + }, + { + "text": "أوللي وقف مؤقتاً لمدة [time : عشر ثواني]" + }, + { + "text": "توقف مؤقتا لمدة [time : عشر ثواني]" + }, + { + "text": "اجعل الإضاءة أكثر [color_type : دفء] بقليل هنا" + }, + { + "text": "تكفى اضبط الإضاءة وخلها [color_type : مريحة للقراءة]" + }, + { + "text": "طف لمبة [house_place : الحمام]" + }, + { + "text": "olly اخفت النور في [house_place : المجلس]" + }, + { + "text": "طفي لمبات [house_place : غرفة النوم]" + }, + { + "text": "أوزن اللمبة [change_amount : على عشرين بالمية]" + }, + { + "text": "olly حط الإضاءة على [change_amount : عشرين في المية]" + }, + { + "text": "خفت الأضواء في [house_place : المطبخ] أولي" + }, + { + "text": "اخفت اللمبات في [house_place : المطبخ]" + }, + { + "text": "olly نظف [house_place : الشقة]" + }, + { + "text": "نظف 
[house_place : البيت] بالمكنسة الكهربائية" + }, + { + "text": "كنس [house_place : البيت] olly" + }, + { + "text": "اكنس [house_place : السجادات]" + }, + { + "text": "أبغى اسمع أغنية [artist_name : محمد عبده] مرة ثانية" + }, + { + "text": "أريد تشغيل هذي [media_type : الموسيقا] مرة ثانية" + }, + { + "text": "قل لي الوقت في [place_name : الرياض]" + }, + { + "text": "قل لي الوقت في [time_zone : توقيت السعودية]" + }, + { + "text": "[order_type : توصيل] [food_type : صيني]" + }, + { + "text": "أكثر خيارات [order_type : التوصيل] تقيماً للاطعمة [food_type : الحجازية]" + }, + { + "text": "olly وش المطاعم الاكثر تقييم للاكل [food_type : الحجازي] و [order_type : يوصلوا]" + }, + { + "text": "[food_type : هندي]" + }, + { + "text": "[food_type : كاري] أي أولاي" + }, + { + "text": "بيتزايتي [food_type : بيتزا] [order_type : توصيل] [place_name : مطعم الشرق]" + }, + { + "text": "وقف منبه [time : سبعة الصبح]" + }, + { + "text": "إيش أخبار [news_topic : كرة القدم] [date : اليوم]" + }, + { + "text": "من فضلك شغل [artist_name : البيتلز] [song_name : أمس]" + }, + { + "text": "أنا أحب موسيقى [music_genre : الروك]" + }, + { + "text": "حقي مفضل موسيقى فرقة [artist_name : كاظم الساهر]" + }, + { + "text": "اترجاك رفع الأضواء إلى [change_amount : الحد الأقصى]" + }, + { + "text": "شغل [device_type : المكنسة الكهربائية]" + }, + { + "text": "شغل [device_type : المكنسة الروبوتية]" + }, + { + "text": "اطلب [food_type : مقلوبة] [meal_type : للعشا]" + }, + { + "text": "[food_type : برغر]" + }, + { + "text": "عادي أطلب [meal_type : عشا] [order_type : سفري] من [business_name : مطعم الطازج]" + }, + { + "text": "هل يدعم وجبات [business_name : هرفي] [order_type : يوصل الطلبات]" + }, + { + "text": "حط المنبه على [time : اثنعش]" + }, + { + "text": "حط منبه بعد [time : أربعين دقيقة من الحين]" + }, + { + "text": "حط منبه على [time : ثمنية] [general_frequency : كل يوم في الإسبوع]" + }, + { + "text": "هل [weather_descriptor : تمطر]" + }, + { + "text": "سوف [weather_descriptor : تمطر]" + }, + { + "text": "هل 
[weather_descriptor : تثلج] حاليا" + }, + { + "text": "ما هو الطقس [date : هذا الاسبوع]" + }, + { + "text": "قل لي أخبار [media_type : بي. بي. سي.]" + }, + { + "text": "ايش الأخبار على [media_type : العربية]" + }, + { + "text": "ما هي أخبار [media_type : الجزيرة]" + }, + { + "text": "شغل [artist_name : أهواك]" + }, + { + "text": "بعض [artist_name : لعب سيء]" + }, + { + "text": "[player_setting : شغل بترتيب عشوائي] قايمة الأغاني هذي" + }, + { + "text": "قول لي نكتة [joke_type : جيدة]" + }, + { + "text": "خبرني عن [date : اليوم]" + }, + { + "text": "اطلب [food_type : pizza]" + }, + { + "text": "[food_type : بيتزا] من [business_name : مطعم الشرقي]" + }, + { + "text": "[order_type : سفري]" + }, + { + "text": "كيف وضع [order_type : التوصيل] عند [business_name : domino's]" + }, + { + "text": "شغل قائمة التشغيل [music_genre : جاز] الخاص بي" + }, + { + "text": "حقي [music_genre : الشعبية] قائمة التشغيل" + }, + { + "text": "[music_genre : jazz]" + }, + { + "text": "هل يمكنك أن تعزف شوية موسيقى [music_genre : jazz]" + }, + { + "text": "فعله [player_setting : تشغيل عشوائي]" + }, + { + "text": "حط السطوع [change_amount : على خمسين في الميه]" + }, + { + "text": "حط المنبه على [time : عشرة الصباح]" + }, + { + "text": "أخبرني بآخر أخبار [news_topic : التقنية]" + }, + { + "text": "انطيني آخر أخبار [news_topic : التكنولوجيا]" + }, + { + "text": "هل [weather_descriptor : تمطر] الحين" + }, + { + "text": "شغلي قايمة تشغيل [artist_name : كاظم الساهر] حقتي" + }, + { + "text": "شغل [artist_name : بلقيس] من قايمة الأغاني حقتي" + }, + { + "text": "شغل [song_name : الأغنية الأخيرة] من قائمتي المفضلة" + }, + { + "text": "[device_type : plug] turn on" + }, + { + "text": "شغل [device_type : الفيش]" + }, + { + "text": "شغل [device_type : الفيش] حقي" + }, + { + "text": "[food_type : الدومينو] [order_type : takeaway]" + }, + { + "text": "[food_type : بيتزا] [order_type : توصيل]" + }, + { + "text": "[order_type : جاهز] [food_type : اسباني]" + }, + { + "text": "كيف الجو في [place_name : الرياض]" + 
}, + { + "text": "قل لي الطقس في [place_name : أبوظبي]" + }, + { + "text": "لو سمحت خل اضاءة [color_type : القراءه لطيفه]" + }, + { + "text": "تكفى خل اللمبات [color_type : مريحه للنظر]" + }, + { + "text": "اجعل [house_place : الغرفة] أكثر إنارة" + }, + { + "text": "كم التاريخ [date : اليوم]" + }, + { + "text": "ما هو [date : اليوم]" + }, + { + "text": "قاعدة [weather_descriptor : تمطر] برا يا olly ولا لا" + }, + { + "text": "غير اللمبات وخليها [color_type : خضرا]" + }, + { + "text": "تكفى نظف [house_place : الأرضية]" + }, + { + "text": "شغل [device_type : فيش قدر الضغط]" + }, + { + "text": "turn off [device_type : rice cooker socket]" + }, + { + "text": "[food_type : سوشي] من [business_name : شيرو]" + }, + { + "text": "حط منبه على [time : أربعة] [timeofday : العصر]" + }, + { + "text": "olly نبهني الساعة [time : ثلاثة بعد الظهر] اروح [event_name : الحفلة]" + }, + { + "text": "نبهني [time : ثلاثة بعد الظهر] عشان أروح [event_name : الحفلة]" + }, + { + "text": "أبي أعرف اذا ضبطت أنا منبه [event_name : لرحلة] [timeofday : الصبح]" + }, + { + "text": "هل [weather_descriptor : تمطر] في [place_name : عسير]" + }, + { + "text": "هل سوف [weather_descriptor : تمطر] [date : اليوم]" + }, + { + "text": "ماذا يحدث في [place_name : العالم]" + }, + { + "text": "ماذا يحدث في [place_name : الرياض]" + }, + { + "text": "اعرض لي شوية أخبار من [media_type : العربية]" + }, + { + "text": "أظهر بعض الأخبار من [media_type : الجزيرة]" + } +] \ No newline at end of file diff --git a/tests/data/ch_ner/test.json b/tests/data/ch_ner/test.json new file mode 100644 index 0000000..ab44208 --- /dev/null +++ b/tests/data/ch_ner/test.json @@ -0,0 +1,62 @@ +[ + { + "text": "[date : 这周] [time : 五点] 叫我起床" + }, + { + "text": "我们想要 [color_type : 粉红色]" + }, + { + "text": "小王关闭了 [house_place : 卧室里的] 灯" + }, + { + "text": "[house_place : 走廊]" + }, + { + "text": "[place_name : 澳大利亚] 现在几点" + }, + { + "text": "olly 告诉我 [time_zone : 格林威治时间加五] 的时间" + }, + { + "text": "关闭我 [time : 七点] 的闹钟" + }, + { + "text": "我喜欢 
[artist_name : 张杰] 的歌曲" + }, + { + "text": "告诉我 [date : 今天的] 日期" + }, + { + "text": "[date : 今天] 是几号" + }, + { + "text": "你能为 [timeofday : 今晚] 的 [meal_type : 晚餐] 订 [food_type : 寿司] 吗" + }, + { + "text": "[time : 这周的] 预报啥情况" + }, + { + "text": "[date : 这周的] 天气怎么样啊" + }, + { + "text": "告诉我 [date : 这周] 的天气" + }, + { + "text": "点 [order_type : 外卖]" + }, + { + "text": "他们提供 [order_type : 外卖] 吗" + }, + { + "text": "他们送 [order_type : 外送] 吗" + }, + { + "text": "他们 [order_type : 外送] 上门吗" + }, + { + "text": "olly 他们有 [order_type : 外送] 上门吗" + }, + { + "text": "定个起床闹钟 [date : 星期四] [time : 七点]" + } +] \ No newline at end of file diff --git a/tests/data/ch_ner/train.json b/tests/data/ch_ner/train.json new file mode 100644 index 0000000..778e4c6 --- /dev/null +++ b/tests/data/ch_ner/train.json @@ -0,0 +1,302 @@ +[ + { + "text": "[date : 星期五] 早上 [time : 九点] 叫醒我" + }, + { + "text": "设个 [time : 两小时后] 的闹钟" + }, + { + "text": "欧丽暂停 [time : 十秒]" + }, + { + "text": "暂停 [time : 十秒钟]" + }, + { + "text": "让这里的灯光更加 [color_type : 温暖] 一点" + }, + { + "text": "请设置 [color_type : 适合阅读的] 灯光" + }, + { + "text": "关掉 [house_place : 浴室] 的灯" + }, + { + "text": "olly 把 [house_place : 大厅] 里的灯调暗" + }, + { + "text": "关掉 [house_place : 卧室的] 灯" + }, + { + "text": "设置灯为 [change_amount : 百分之二十]" + }, + { + "text": "olly 把灯光调到 [change_amount : 两成]" + }, + { + "text": "小度变暗 [house_place : 厨房] 灯光" + }, + { + "text": "调暗 [house_place : 厨房] 灯" + }, + { + "text": "olly 打扫 [house_place : 公寓]" + }, + { + "text": "给 [house_place : 房子] 吸尘" + }, + { + "text": "小度把 [house_place : 房子] 用吸尘器吸干净" + }, + { + "text": "[house_place : 地毯]" + }, + { + "text": "我想再听一次 [artist_name : 周杰伦] 的歌" + }, + { + "text": "我想再次播放那首 [media_type : 音乐]" + }, + { + "text": "告诉我 [place_name : 莫斯科] 时间" + }, + { + "text": "告诉我 [time_zone : 格林威治时间加五] 是几点" + }, + { + "text": "欧丽列出所有评分高的 [food_type : 中] 式 [order_type : 外送]" + }, + { + "text": "评分最高的 [food_type : 中餐] [order_type : 外送]" + }, + { + "text": "olly 评分最高的 [food_type : 中] 餐 [order_type : 外送]" + 
}, + { + "text": "我想要吃一些 [food_type : 咖喱] 有什么推荐吗" + }, + { + "text": "天猫精灵如果有任何推荐的话我想吃点 [food_type : 咖喱]" + }, + { + "text": "在 [place_name : 三源里菜市场] 附近找我的 [food_type : 泰国菜] [order_type : 外卖]" + }, + { + "text": "停止 [time : 七点] 的闹钟" + }, + { + "text": "[date : 今天] 有什么关于 [news_topic : 足球] 的消息吗" + }, + { + "text": "请播放 [artist_name : 飞轮海] 的 [song_name : 只对你有感觉]" + }, + { + "text": "我喜欢 [music_genre : 摇滚] 音乐" + }, + { + "text": "[artist_name : 告五人] 我的最喜爱音乐乐队" + }, + { + "text": "请调亮灯 [change_amount : 到最大]" + }, + { + "text": "启动 [device_type : 扫地机器人]" + }, + { + "text": "启动 [device_type : 扫地机器人]" + }, + { + "text": "请为 [meal_type : 晚餐] 订一些 [food_type : 寿司]" + }, + { + "text": "我想你下单 [food_type : 汉堡包]" + }, + { + "text": "我能从 [business_name : 萨莉亚意式餐厅] 点 [order_type : 外卖] 当 [meal_type : 晚餐] 吗" + }, + { + "text": "[business_name : 拜伦] 有 [order_type : 外卖] 吗" + }, + { + "text": "定个 [time : 十二点] 的闹钟" + }, + { + "text": "设置一个 [time : 四十分钟后] 的闹钟" + }, + { + "text": "设个 [general_frequency : 每个工作日] [time : 八点] 的闹钟" + }, + { + "text": "[weather_descriptor : 下雨] 了吗" + }, + { + "text": "要 [weather_descriptor : 下雨] 了嘛" + }, + { + "text": "正在 [weather_descriptor : 下雪] 吗" + }, + { + "text": "[date : 这周的] 天气如何" + }, + { + "text": "告诉我 [media_type : 环球时报] 的新闻" + }, + { + "text": "[media_type : 中国新闻网] 有什么新闻" + }, + { + "text": "[media_type : 呼市新闻] 的最新新聞是什么" + }, + { + "text": "播放 [artist_name : 五月天]" + }, + { + "text": "放一些 [artist_name : 五月天] 的音乐" + }, + { + "text": "[player_setting : 随机播放] 此播放列表" + }, + { + "text": "给我讲一个 [joke_type : 有趣的] 笑话" + }, + { + "text": "告诉我 [date : 今天] 的情况" + }, + { + "text": "下单 [food_type : 比萨]" + }, + { + "text": "从 [business_name : 美团外卖] 点 [food_type : 比萨]" + }, + { + "text": "我的 [order_type : 外卖] 还有多久" + }, + { + "text": "[business_name : 多米诺的] [order_type : 运送] 情况" + }, + { + "text": "播放我的 [music_genre : 爵士乐] 播放列表" + }, + { + "text": "开始我的 [music_genre : 爵士乐] 播放列表" + }, + { + "text": "我喜欢 [music_genre : 爵士乐]" + }, + { + "text": "你能播放一些 [music_genre : 
爵士] 乐吗" + }, + { + "text": "启用 [player_setting : 随机播放]" + }, + { + "text": "将亮度设置 [change_amount : 到五成]" + }, + { + "text": "定个 [time : 十点] 闹钟" + }, + { + "text": "告诉我最新的 [news_topic : 科技] 消息" + }, + { + "text": "告诉我最新的 [news_topic : 科技] 消息" + }, + { + "text": "现在 [weather_descriptor : 下雨] 了吗" + }, + { + "text": "打开我的 [artist_name : 周杰伦] 歌单" + }, + { + "text": "从我的播放列表播放 [artist_name : 陈奕迅] 的歌" + }, + { + "text": "[song_name : 最后一首]" + }, + { + "text": "开启 [device_type : 智能插座]" + }, + { + "text": "打开 [device_type : 插头]" + }, + { + "text": "开启我的 [device_type : 智能插座]" + }, + { + "text": "[food_type : 必胜客] 做 [order_type : 外卖] 吗" + }, + { + "text": "我最喜欢的 [food_type : 比萨] 可以点 [order_type : 外卖] 吗" + }, + { + "text": "[order_type : 外带] [food_type : 西班牙] 的" + }, + { + "text": "[place_name : 北京] 天气咋样啊" + }, + { + "text": "告诉我 [place_name : 上海] 的天气情况" + }, + { + "text": "调整光线为 [color_type : 适合阅读]" + }, + { + "text": "请调整灯光 [color_type : 有利于观看]" + }, + { + "text": "使得 [house_place : 房间] 更明亮" + }, + { + "text": "[date : 今天] 几月几号" + }, + { + "text": "[date : 今天] 是周几" + }, + { + "text": "小爱 [weather_descriptor : 下雨]" + }, + { + "text": "把灯光改成 [color_type : 绿色]" + }, + { + "text": "请打扫 [house_place : 地板]" + }, + { + "text": "打开 [device_type : 电饭煲插座]" + }, + { + "text": "关掉 [device_type : 电饭煲的插座]" + }, + { + "text": "你能帮我从 [business_name : 戶戶送] 订一些 [food_type : 寿司] 吗" + }, + { + "text": "设个 [timeofday : 下午] [time : 四点] 的闹钟" + }, + { + "text": "olly [time : 下午三点] 提醒我去 [event_name : 音乐会]" + }, + { + "text": "提醒我 [time : 下午三点] 去 [event_name : 音乐会]" + }, + { + "text": "[timeofday : 早上] [event_name : 航班] 我设闹钟了吗" + }, + { + "text": "[place_name : 郑州] [weather_descriptor : 下雨] 了吗" + }, + { + "text": "[date : 今天] [weather_descriptor : 下雨] 吗" + }, + { + "text": "[place_name : 世界] 上到底发生了什么" + }, + { + "text": "[place_name : 佛山] 发生了什么" + }, + { + "text": "给我展示一些来自 [media_type : 腾讯新闻] 的消息" + }, + { + "text": "展示一些 [media_type : 今日头条] 的新闻" + } +] \ No newline at end of file diff --git 
a/tests/data/en_ner/test.json b/tests/data/en_ner/test.json new file mode 100644 index 0000000..4338028 --- /dev/null +++ b/tests/data/en_ner/test.json @@ -0,0 +1,62 @@ +[ + { + "text": "wake me up at [time : five am] [date : this week]" + }, + { + "text": "[color_type : pink] is all we need" + }, + { + "text": "olly turn the lights off in the [house_place : bedroom]" + }, + { + "text": "hoover the [house_place : hallway]" + }, + { + "text": "what's the time in [place_name : australia]" + }, + { + "text": "olly tell me the time in [time_zone : g. m. t. plus five]" + }, + { + "text": "cancel my [time : seven am] alarm" + }, + { + "text": "i like [artist_name : senatra] songs" + }, + { + "text": "tell me [date : today's] date" + }, + { + "text": "what date is it [date : today]" + }, + { + "text": "could you order [food_type : sushi] for [timeofday : tonight] [meal_type : dinner]" + }, + { + "text": "what's the [time : week's] forecast" + }, + { + "text": "what's [date : this week's] weather" + }, + { + "text": "tell me the weather [date : this week]" + }, + { + "text": "order a [order_type : takeaway]" + }, + { + "text": "can they provide [order_type : takeaway]" + }, + { + "text": "can they do [order_type : delivery]" + }, + { + "text": "do they [order_type : deliver] home" + }, + { + "text": "olly do they [order_type : deliver] home" + }, + { + "text": "set wake up [date : thursday] [time : seven am]" + } +] \ No newline at end of file diff --git a/tests/data/en_ner/train.json b/tests/data/en_ner/train.json new file mode 100644 index 0000000..e65e4ee --- /dev/null +++ b/tests/data/en_ner/train.json @@ -0,0 +1,302 @@ +[ + { + "text": "wake me up at [time : nine am] on [date : friday]" + }, + { + "text": "set an alarm for [time : two hours from now]" + }, + { + "text": "olly pause for [time : ten seconds]" + }, + { + "text": "pause for [time : ten seconds]" + }, + { + "text": "make the lighting bit more [color_type : warm] here" + }, + { + "text": "please set the 
lighting [color_type : suitable for reading]" + }, + { + "text": "turn off the light in the [house_place : bathroom]" + }, + { + "text": "olly dim the lights in the [house_place : hall]" + }, + { + "text": "turn the lights off in the [house_place : bedroom]" + }, + { + "text": "set lights [change_amount : to twenty percent]" + }, + { + "text": "olly set lights [change_amount : to twenty percent]" + }, + { + "text": "dim the lights in the [house_place : kitchen] olly" + }, + { + "text": "dim the lights in the [house_place : kitchen]" + }, + { + "text": "olly clean the [house_place : flat]" + }, + { + "text": "vacuum the [house_place : house]" + }, + { + "text": "vacuum the [house_place : house] olly" + }, + { + "text": "hoover the [house_place : carpets] around" + }, + { + "text": "i want to listen [artist_name : arijit singh] song once again" + }, + { + "text": "i want to play that [media_type : music] one again" + }, + { + "text": "tell me the time in [place_name : moscow]" + }, + { + "text": "tell me the time in [time_zone : g. m. t. 
plus five]" + }, + { + "text": "olly list most rated [order_type : delivery] options for [food_type : chinese] food" + }, + { + "text": "most rated [order_type : delivery] options for [food_type : chinese] food" + }, + { + "text": "olly most rated [order_type : delivery] options for [food_type : chinese] food" + }, + { + "text": "i want some [food_type : curry] to go any recommendations" + }, + { + "text": "i want some [food_type : curry] to go any recommendations olly" + }, + { + "text": "find my [food_type : thai] [order_type : takeaways] around [place_name : grassmarket]" + }, + { + "text": "stop [time : seven am] alarm" + }, + { + "text": "what's happening in [news_topic : football] [date : today]" + }, + { + "text": "please play [song_name : yesterday] from [artist_name : beatles]" + }, + { + "text": "i like [music_genre : rock] music" + }, + { + "text": "my favorite music band is [artist_name : queen]" + }, + { + "text": "please raise the lights [change_amount : to max]" + }, + { + "text": "hey start [device_type : vacuum cleaner robot]" + }, + { + "text": "turn [device_type : cleaner robot] on" + }, + { + "text": "please order some [food_type : sushi] for [meal_type : dinner]" + }, + { + "text": "hey i'd like you to order [food_type : burger]" + }, + { + "text": "can i order [order_type : takeaway] [meal_type : dinner] from [business_name : byron's]" + }, + { + "text": "does [business_name : byron's] supports [order_type : takeaways]" + }, + { + "text": "set an alarm for [time : twelve]" + }, + { + "text": "set an alarm [time : forty minutes from now]" + }, + { + "text": "set alarm for [time : eight] [general_frequency : every weekday]" + }, + { + "text": "is it [weather_descriptor : raining]" + }, + { + "text": "is it going to [weather_descriptor : rain]" + }, + { + "text": "is it currently [weather_descriptor : snowing]" + }, + { + "text": "what's [date : this weeks] weather" + }, + { + "text": "tell me [media_type : b. b. c.] 
news" + }, + { + "text": "what's the news on [media_type : b. b. c.] news" + }, + { + "text": "what is the [media_type : b. b. c.'s] latest news" + }, + { + "text": "play [artist_name : daft punk]" + }, + { + "text": "put on some [artist_name : coldplay]" + }, + { + "text": "[player_setting : shuffle] this playlist" + }, + { + "text": "tell me a [joke_type : good] joke" + }, + { + "text": "tell me about [date : today]" + }, + { + "text": "order a [food_type : pizza]" + }, + { + "text": "order me a [food_type : byron] from [business_name : deliveroo]" + }, + { + "text": "how long until my [order_type : takeaway]" + }, + { + "text": "[business_name : domino's] [order_type : delivery] status" + }, + { + "text": "play my [music_genre : jazz] playlist" + }, + { + "text": "start my [music_genre : jazz] playlist" + }, + { + "text": "i like [music_genre : jazz]" + }, + { + "text": "can you play some [music_genre : jazz]" + }, + { + "text": "enable [player_setting : shuffle]" + }, + { + "text": "set brightness [change_amount : to fifty percent]" + }, + { + "text": "set alarm at [time : ten am]" + }, + { + "text": "tell me the latest [news_topic : technology] news" + }, + { + "text": "tell me latest [news_topic : technology] news" + }, + { + "text": "is it [weather_descriptor : raining] now" + }, + { + "text": "turn on my [artist_name : michael jackson] playlist" + }, + { + "text": "play [artist_name : michael jackson] from my playlist" + }, + { + "text": "play [song_name : last song] from my favorite playlist" + }, + { + "text": "turn on the [device_type : plug]" + }, + { + "text": "switch on the [device_type : plug]" + }, + { + "text": "turn on my [device_type : plug]" + }, + { + "text": "does [food_type : dominoes] do [order_type : takeaway]" + }, + { + "text": "does my favorite [food_type : pizza] place available for [order_type : takeaway]" + }, + { + "text": "can i order [order_type : takeaway] from [food_type : spanish] place" + }, + { + "text": "how's the weather 
like in [place_name : beijing]" + }, + { + "text": "tell me the weather in [place_name : shanghai]" + }, + { + "text": "please make the lights [color_type : reading friendly]" + }, + { + "text": "please make the lights [color_type : watching friendly]" + }, + { + "text": "make the [house_place : room] brighter" + }, + { + "text": "what's date [date : today]" + }, + { + "text": "what day is [date : today]" + }, + { + "text": "is it [weather_descriptor : raining] outside olly" + }, + { + "text": "change the lights into [color_type : green]" + }, + { + "text": "clean the [house_place : floor] please" + }, + { + "text": "turn on the [device_type : rice cooker socket]" + }, + { + "text": "turn off the [device_type : rice cooker socket]" + }, + { + "text": "could you please help me to order some [food_type : sushi] from [business_name : deliveroo]" + }, + { + "text": "set an alarm for [time : four] in the [timeofday : afternoon]" + }, + { + "text": "olly alert me at [time : three p. m.] to go to the [event_name : concert]" + }, + { + "text": "alert me at [time : three p. m.] to go to the [event_name : concert]" + }, + { + "text": "do i have an alarm set for [timeofday : morning] [event_name : flight]" + }, + { + "text": "is it [weather_descriptor : raining] in [place_name : barcelona]" + }, + { + "text": "will it [weather_descriptor : rain] [date : today]" + }, + { + "text": "what's going on in the [place_name : world]" + }, + { + "text": "what's happening in [place_name : cambridge]" + }, + { + "text": "show me some news from [media_type : b. b. c.]" + }, + { + "text": "olly show me some news from [media_type : b. b. 
c.]" + } +] \ No newline at end of file diff --git a/tests/data/fr_ner/test.json b/tests/data/fr_ner/test.json new file mode 100644 index 0000000..29dad70 --- /dev/null +++ b/tests/data/fr_ner/test.json @@ -0,0 +1,62 @@ +[ + { + "text": "réveille-moi à [time : cinq heures du matin] [date : cette semaine]" + }, + { + "text": "le [color_type : rose] est tout ce dont nous avons besoin" + }, + { + "text": "olly éteindre les lumières dans la [house_place : chambre]" + }, + { + "text": "passer l'aspirateur dans le [house_place : couloir]" + }, + { + "text": "quelle heure est-il en [place_name : australie]" + }, + { + "text": "olly donne-moi l'heure en [time_zone : g. m. t. plus cinq]" + }, + { + "text": "annuler mon alarme de [time : sept heures du matin]" + }, + { + "text": "j' aime les chansons de [artist_name : jacques brel]" + }, + { + "text": "dis-moi la date [date : d'aujourd'hui]" + }, + { + "text": "quelle date est-il [date : aujourd'hui]" + }, + { + "text": "pourrais-tu commander des [food_type : sushis] pour le [meal_type : dîner] de [timeofday : ce soir]" + }, + { + "text": "quelles sont les prévisions [time : de la semaine]" + }, + { + "text": "quel temps fait-il [date : cette semaine]" + }, + { + "text": "dis-moi quel temps il fera [date : cette semaine]" + }, + { + "text": "commande un repas [order_type : à emporter]" + }, + { + "text": "peuvent-ils proposer des plats [order_type : à emporter]" + }, + { + "text": "peuvent-ils faire la [order_type : livraison]" + }, + { + "text": "est-ce qu'ils [order_type : livrent] à domicile" + }, + { + "text": "olly est-ce qu'ils [order_type : livrent] à la maison" + }, + { + "text": "régler le réveil [date : jeudi] à [time : sept heures du matin]" + } +] \ No newline at end of file diff --git a/tests/data/fr_ner/train.json b/tests/data/fr_ner/train.json new file mode 100644 index 0000000..0a8743f --- /dev/null +++ b/tests/data/fr_ner/train.json @@ -0,0 +1,302 @@ +[ + { + "text": "réveille-moi à [time : neuf heures du 
matin] le [date : vendredi]" + }, + { + "text": "régler une alarme pour [time : deux heures à partir dès maintenant]" + }, + { + "text": "olly mets en pause pour [time : dix secondes]" + }, + { + "text": "pause pendant [time : dix secondes]" + }, + { + "text": "rendre la luminosité un peu plus [color_type : chaud] ici" + }, + { + "text": "merci de régler l'éclairage [color_type : adapté à la lecture]" + }, + { + "text": "éteindre la lumière dans la [house_place : salle de bain]" + }, + { + "text": "olly tamiser les lumières dans la [house_place : salle]" + }, + { + "text": "éteins les lumières dans la [house_place : chambre]" + }, + { + "text": "régler les lumières [change_amount : à vingt pour cent]" + }, + { + "text": "olly régler les lumières [change_amount : à vingt pour cent]" + }, + { + "text": "tamiser les lumières dans la [house_place : cuisine] olly" + }, + { + "text": "tamiser les lumières dans la [house_place : cuisine]" + }, + { + "text": "olly nettoie la [house_place : télévision]" + }, + { + "text": "passer l'aspirateur dans la [house_place : maison]" + }, + { + "text": "olly passer l'aspirateur dans la [house_place : maison]" + }, + { + "text": "passer l'aspirateur sur les [house_place : tapis] autour" + }, + { + "text": "je veux écouter la chanson de [artist_name : jacques brel] encore une fois" + }, + { + "text": "je veux rejouer cette [media_type : musique]" + }, + { + "text": "dis-moi l'heure à [place_name : bordeaux]" + }, + { + "text": "dis-moi l'heure en [time_zone : t.m.g. 
plus cinq]" + }, + { + "text": "olly liste les options de [order_type : livraison] les plus cotées pour la nourriture [food_type : chinoise]" + }, + { + "text": "les options de [order_type : livraison] les plus notées pour du [food_type : chinois]" + }, + { + "text": "olly les options de [order_type : livraison] les plus notées pour du [food_type : chinois]" + }, + { + "text": "je veux du [food_type : piment] pour toutes les recommandations" + }, + { + "text": "je veux du [food_type : curry] pour toutes les recommandations olly" + }, + { + "text": "trouve mes [order_type : plats à emporter] [food_type : thaïlandais] autour de [place_name : la concorde]" + }, + { + "text": "arrêter l'alarme de [time : sept heures du matin]" + }, + { + "text": "ce qui se passe dans le [news_topic : football] [date : aujourd'hui]" + }, + { + "text": "s'il te plais joue j' ai demandé à la lune d' [artist_name : indochine]" + }, + { + "text": "j'aime la musique [music_genre : rock]" + }, + { + "text": "ma musique favori de mon groupe [artist_name : phoenix]" + }, + { + "text": "augmenter les lumières [change_amount : au maximum] s'il vous plaît" + }, + { + "text": "salut démarrez le [device_type : robot aspirateur]" + }, + { + "text": "allumer le [device_type : robot nettoyeur]" + }, + { + "text": "s'il te plaît commande du [food_type : sushi] pour le [meal_type : dîner]" + }, + { + "text": "salut j'aimerais que tu commandes un [food_type : hamburger]" + }, + { + "text": "puis-je commander un [meal_type : dîner] [order_type : à emporter] chez [business_name : byron's]" + }, + { + "text": "est-ce que [business_name : byron's] prend en charge [order_type : les plats à emporter]" + }, + { + "text": "régler une alarme pour [time : douze]" + }, + { + "text": "régler l'alarme pour [time : huit] tous les jours de la semaine" + }, + { + "text": "il [weather_descriptor : pleut]" + }, + { + "text": "quel est le temps [date : cette semaine]" + }, + { + "text": "dis moi [media_type : b. b. c.] 
nouvelles" + }, + { + "text": "quelles sont les nouvelles sur [media_type : b. b. c.] nouvelles" + }, + { + "text": "quel est les dernières nouvelles [media_type : de b. b. c.]" + }, + { + "text": "joue [artist_name : daft punk]" + }, + { + "text": "joue un peu de [artist_name : coldplay]" + }, + { + "text": "[player_setting : mélange] cette liste de lecture" + }, + { + "text": "dis moi une [joke_type : bonne] blague" + }, + { + "text": "parle moi [date : d'aujourd'hui]" + }, + { + "text": "commande une [food_type : pizza]" + }, + { + "text": "commande moi un [food_type : byron] chez [business_name : deliveroo]" + }, + { + "text": "combien de temps avant ma commande [order_type : à emporter]" + }, + { + "text": "état de [order_type : livraison] de [business_name : domino's]" + }, + { + "text": "joue ma playlist [music_genre : jazz]" + }, + { + "text": "commence ma playlist de [music_genre : jazz]" + }, + { + "text": "j'aime le [music_genre : jazz]" + }, + { + "text": "peux-tu jouer du [music_genre : jazz]" + }, + { + "text": "active le [player_setting : mode aléatoire]" + }, + { + "text": "régler la luminosité [change_amount : à cinquante pour cent]" + }, + { + "text": "régler l'alarme à [time : dix heures du matin]" + }, + { + "text": "dites-moi les dernières nouvelles de la [news_topic : technologie]" + }, + { + "text": "dites-moi les dernières nouvelles de la [news_topic : technologie]" + }, + { + "text": "est-ce qu'il [weather_descriptor : pleut] maintenant" + }, + { + "text": "active ma liste de lecture [artist_name : michael jackson]" + }, + { + "text": "jouer au [artist_name : michael jackson] dans ma playlist" + }, + { + "text": "jouer [song_name : la dernière chanson] préféré de ma playlist" + }, + { + "text": "allume la [device_type : prise]" + }, + { + "text": "allume la [device_type : prise]" + }, + { + "text": "allumer ma [device_type : prise]" + }, + { + "text": "est-ce que [food_type : dominoes] font [order_type : à emporter]" + }, + { + "text": 
"est-ce que ma [food_type : pizzeria] préférée est disponible pour des commandes [order_type : à emporter]" + }, + { + "text": "puis-je commander un plat [food_type : espagnol] [order_type : à emporter]" + }, + { + "text": "quel temps fait-il à [place_name : paris]" + }, + { + "text": "dis-moi la météo en [place_name : monaco]" + }, + { + "text": "s'il vous plaît rendre les lumières conviviales pour la [color_type : lecture amicale]" + }, + { + "text": "s'il vous plaît rendre les lumières [color_type : amicales]" + }, + { + "text": "rends la [house_place : chambre] plus lumineuse" + }, + { + "text": "quelle est la date [date : aujourd'hui]" + }, + { + "text": "quel jour sommes-nous [date : aujourd'hui]" + }, + { + "text": "est-ce qu'il [weather_descriptor : pleut] dehors olly" + }, + { + "text": "changer les lumières en [color_type : verte]" + }, + { + "text": "nettoie le [house_place : sol] s'il te plaît" + }, + { + "text": "active la [device_type : prise de cuiseur à riz]" + }, + { + "text": "éteindre la [device_type : prise de cuiseur à riz]" + }, + { + "text": "pourriez-vous s'il vous plaît m'aider à commander des [food_type : sushis] de [business_name : deliveroo]" + }, + { + "text": "régler une alarme pour [time : quatre] heures de [timeofday : l'après-midi]" + }, + { + "text": "olly allerte moi a [time : trois heures de l'après-midi] pour que j'aille au [event_name : concert]" + }, + { + "text": "préviens-moi à [time : trois heures du soir] pour aller au [event_name : concert]" + }, + { + "text": "ai-je une alarme réglée pour le [event_name : vol] du [timeofday : matin]" + }, + { + "text": "il est [weather_descriptor : en train de pleuvoir] à [place_name : québec]" + }, + { + "text": "va-t-il [weather_descriptor : pleuvoir] [date : aujourd'hui]" + }, + { + "text": "ce qui se passe dans le [place_name : monde]" + }, + { + "text": "que se passe-t-il à [place_name : cambridge]" + }, + { + "text": "montre moi des nouvelles de [media_type : b. b. 
c.]" + }, + { + "text": "olly montre moi des nouvelles [media_type : b. b. c.]" + }, + { + "text": "jouer [media_type : c. n. n.] nouvelles" + }, + { + "text": "olly [media_type : hackernews]" + }, + { + "text": "met un peu de [music_genre : hihop]" + } +] \ No newline at end of file diff --git a/tests/data/hi_ner/test.json b/tests/data/hi_ner/test.json new file mode 100644 index 0000000..3f8e98d --- /dev/null +++ b/tests/data/hi_ner/test.json @@ -0,0 +1,62 @@ +[ + { + "text": "मुझे [date : इस सप्ताह] सुबह [time : पांच बजे] जगा दो" + }, + { + "text": "हमको [color_type : गुलाबी] ही चाहिए" + }, + { + "text": "ओली [house_place : शयनकक्ष] में बत्तियाँ बंद कर दें" + }, + { + "text": "हुवर को [house_place : हॉल] में चालाओ" + }, + { + "text": "[place_name : भारत] में समय क्या है" + }, + { + "text": "olly मुझे [time_zone : जी. एम. टी.] में समय बताओ" + }, + { + "text": "मेरा [time : सुबह सात बजे] का अलार्म रद्द करो" + }, + { + "text": "मुझे [artist_name : अरिजीत सिंह के] गाने पसंद है" + }, + { + "text": "[date : आज की] तारीख बताओ" + }, + { + "text": "[date : आज] क्या तारीख है" + }, + { + "text": "क्या आप [timeofday : आज रात] [meal_type : के खाने] के लिए [food_type : सूप] ऑर्डर कर सकते हैं" + }, + { + "text": "[time : सप्ताह] का पूर्वानुमान क्या है" + }, + { + "text": "[date : इस सप्ताह] का मौसम क्या है" + }, + { + "text": "मुझे [date : इस सप्ताह] का मौसम बताओ" + }, + { + "text": "[order_type : ले जाने के लिए] एक खरीदारी करो" + }, + { + "text": "क्या वे [order_type : takeaway] प्रदान कर सकते हैं" + }, + { + "text": "क्या वे [order_type : डिलीवरी] कर सकते हैं" + }, + { + "text": "क्या वें घर [order_type : deliver] करते हैं" + }, + { + "text": "ओली क्या वे घर [order_type : deliver]" + }, + { + "text": "[date : गुरुवार] सुबह [time : सात बजे] उठना सेट करें" + } +] \ No newline at end of file diff --git a/tests/data/hi_ner/train.json b/tests/data/hi_ner/train.json new file mode 100644 index 0000000..0150df7 --- /dev/null +++ b/tests/data/hi_ner/train.json @@ -0,0 +1,302 @@ +[ + { 
+ "text": "[date : शुक्रवार] को सुबह [time : नौ बजे] मुझे जगा दो" + }, + { + "text": "[time : अभी से दो घंटे] के लिए अलार्म लगाओ" + }, + { + "text": "olly [time : दस सेकंड] के लिए रुको" + }, + { + "text": "[time : दस सेकंड] के लिए रुकें" + }, + { + "text": "यहां रोशनी को थोड़ा अधिक [color_type : गर्म] करें" + }, + { + "text": "लाइट को [color_type : पढ़ने के लिए उपयुक्त] होने तक सेट करें" + }, + { + "text": "[house_place : स्नानघर] में बत्तियाँ बंद करें" + }, + { + "text": "olly [house_place : हॉल] की लाइट्स को डिम कर दो" + }, + { + "text": "[house_place : शयनकक्ष] में बत्तियाँ बंद कर दें" + }, + { + "text": "लाइट्स को [change_amount : बीस प्रतिशत पे] सेट कर दो" + }, + { + "text": "olly लाइट्स को [change_amount : बीस प्रतिशत पर] कर दो" + }, + { + "text": "[house_place : रसोईघर] में बत्तियां कम करें olly" + }, + { + "text": "[house_place : रसोई] में रोशनी कम करें" + }, + { + "text": "olly [house_place : समतल] साफ करें" + }, + { + "text": "[house_place : मकान] साफ करो" + }, + { + "text": "olly [house_place : घर] को वैक्यूम करें" + }, + { + "text": "चारों ओर [house_place : carpets] को घेरें" + }, + { + "text": "मैं एक बार फिर [artist_name : अरिजीत सिंह] का गाना सुनना चाहता हूं" + }, + { + "text": "मैं उस [media_type : संगीत] को फिर से बजाना चाहता हूं" + }, + { + "text": "मुझे [place_name : नई दिल्ली] में समय बताओ" + }, + { + "text": "मुझे [time_zone : जी. एम. टी.] 
में समय बताओ" + }, + { + "text": "olly [food_type : बिहारी] भोजन के लिए सबसे अधिक रेटेड [order_type : वितरण] विकल्पों की सूची बनाएं" + }, + { + "text": "[food_type : chinese] भोजन के लिए सबसे अधिक मूल्यांकित [order_type : delivery] विकल्प" + }, + { + "text": "[food_type : चीनी] भोजन के लिए ऑली मोस्ट रेटेड [order_type : डिलीवरी] विकल्प" + }, + { + "text": "मैं कुछ [food_type : डोसा] चाहता हूँ मुझे कुछ सुझाव दो" + }, + { + "text": "olly मुझे [food_type : डोसा] खाना है इसलिए तुम मुझे कोई सुझाव दो" + }, + { + "text": "[place_name : रॉयल इंडियन] के आसपास मेरे [food_type : भारतीय] [order_type : टेकअवे] खोजें" + }, + { + "text": "[time : सुबह सात बजे] अलार्म बंद करो" + }, + { + "text": "[date : आज] [news_topic : क्रिकेट] में क्या हो रहा है" + }, + { + "text": "कृपया [artist_name : अरिजित सिंह] के द्वारा [song_name : तुम ही हो] चालू करो" + }, + { + "text": "मुझे [music_genre : राॅक] संगीत पसंद है" + }, + { + "text": "मेरा पसंदीदा संगीत बैंड [artist_name : रानी] है" + }, + { + "text": "कृपया रोशनी को [change_amount : अधिकतम तक] बढ़ाएं" + }, + { + "text": "अरे [device_type : वैक्यूम क्लीनर रोबोट] शुरू करो" + }, + { + "text": "[device_type : क्लीनर रोबोट] को चालू करो" + }, + { + "text": "कृपया मेरे [meal_type : dinner] के लिए कुछ [food_type : सूप] ऑर्डर केआर दो" + }, + { + "text": "अरे मैं चाहता हूँ की आप [food_type : burger] ऑर्डर करे" + }, + { + "text": "क्या मैं [business_name : देवा भोजनालय] से [order_type : टेकअवे] [meal_type : डिनर] मंगवा सकता हूँ" + }, + { + "text": "क्या [business_name : पिज्जा हट] पर [order_type : साथ ले जाने] की सुविधा है" + }, + { + "text": "[time : बारह] बजे के लिए अलार्म सेट करें" + }, + { + "text": "[time : अभी से चालीस मिनट] का अलार्म लगाओ" + }, + { + "text": "[general_frequency : प्रत्येक सप्ताह] के [time : आठ दिनों] के लिए अलार्म सेट करें" + }, + { + "text": "क्या [weather_descriptor : बारिश हो रही है]" + }, + { + "text": "क्या [weather_descriptor : बारिश] होने वाली है" + }, + { + "text": "क्या इस समय [weather_descriptor : बर्फ़] पड़ रही है" 
+ }, + { + "text": "[date : इस सप्ताह] का मौसम क्या है" + }, + { + "text": "[media_type : आज तक] मुझे सुनाओ" + }, + { + "text": "[media_type : बी. बी. सी.] समाचार पर क्या खबर है" + }, + { + "text": "[media_type : इंडिया न्यूज़] ताजा खबर क्या है" + }, + { + "text": "[artist_name : उदित नारायण] चलाओ" + }, + { + "text": "कुछ [artist_name : मोहम्मद रफ़ी] का चलाओ" + }, + { + "text": "इस प्लेलिस्ट को [player_setting : शफ़ल] करें" + }, + { + "text": "मुझे एक [joke_type : अच्छा] चुटकुला सुनाओ" + }, + { + "text": "मुझे [date : आज] के बारे में बताओ" + }, + { + "text": "एक [food_type : पिज़्ज़ा] ऑर्डर करो" + }, + { + "text": "[business_name : पंजाबी ढाबा] से मेरे लिए एक [food_type : आलू] का [food_type : पराठा] ऑर्डर करो" + }, + { + "text": "कब तक मेरा [order_type : टेकअवे]" + }, + { + "text": "[business_name : domino's] चीनी [order_type : delivery] करें" + }, + { + "text": "मेरी [music_genre : भक्ति] प्लेलिस्ट चलाओ" + }, + { + "text": "मेरे [music_genre : भक्ति] वाले प्लेलिस्ट को चालू करो" + }, + { + "text": "मैं [music_genre : भजन] पसंद करता हूँ" + }, + { + "text": "क्या आप कुछ [music_genre : भड़कीला] गाना गाना प्ले कर सकते हैं" + }, + { + "text": "[player_setting : फेरबदल] सक्षम करें" + }, + { + "text": "चमक को [change_amount : पचास प्रतिशत तक] सेट करें" + }, + { + "text": "[time : प्रातः दस] दस बजे अलार्म लगाओ" + }, + { + "text": "मुझे नवीनतम [news_topic : प्रौद्योगिकी] समाचार बताओ" + }, + { + "text": "मुझे नवीनतम [news_topic : प्रौद्योगिकी] के समाचार बताओ" + }, + { + "text": "यह अब [weather_descriptor : बारिश हो रही है]" + }, + { + "text": "मेरा [artist_name : अरिजीत सिंह] प्लेलिस्ट चालू करें" + }, + { + "text": "मेरी प्लेलिस्ट से [artist_name : शंकर महादेवन] चालू करें" + }, + { + "text": "मेरी पसंदीदा प्लेलिस्ट का [song_name : आखिरी गाना] चलाओ" + }, + { + "text": "[device_type : प्लग] चालू करें" + }, + { + "text": "[device_type : प्लग] चालू करें" + }, + { + "text": "मेरा [device_type : प्लग] चालू करो" + }, + { + "text": "क्या [food_type : dominos] [order_type : डिलीवरी] 
करता है" + }, + { + "text": "क्या मेरा पसंदीदा [food_type : पिज़्ज़ा] स्थान [order_type : टेकअवे] के लिए उपलब्ध है" + }, + { + "text": "क्या मैं [food_type : स्पेनिश] जगह से [order_type : टेकअवे] ऑर्डर कर सकता हूं" + }, + { + "text": "[place_name : बीजिंग] में मौसम कैसा है" + }, + { + "text": "मुझे [place_name : शंघाई] में मौसम बताओ" + }, + { + "text": "कृपया बत्तियों को [color_type : पढ़ने के अनुकूल] बनाएं" + }, + { + "text": "कृपया रोशनी को [color_type : अनुकूल] बनाओ" + }, + { + "text": "[house_place : कमरे] को उज्जवल बनाना" + }, + { + "text": "[date : आज] की तारीख क्या है" + }, + { + "text": "[date : आज] कौन सा दिन है" + }, + { + "text": "शिमला के बाहर क्या [weather_descriptor : बारिश हो रही है]" + }, + { + "text": "रोशनी को [color_type : हरे] रंग में बदलें" + }, + { + "text": "कृपया [house_place : फर्श] साफ करें" + }, + { + "text": "[device_type : चावल कुकर सॉकेट] चालू करें" + }, + { + "text": "[device_type : चावल कुकर सॉकेट] बंद कर दें" + }, + { + "text": "क्या आप [business_name : होटल ग्रेट इंडिया] से कुछ [food_type : सूप] मंगवाने में मेरी मदद कर सकते हैं" + }, + { + "text": "[timeofday : दोपहर] के [time : चार] बजे का अलार्म लगाओ" + }, + { + "text": "ओली ने मुझे [time : तीन बजे] सचेत किया संगीत [event_name : कार्यक्रम] में जाने के लिए" + }, + { + "text": "मुझे [time : तीन बजे] सचेत करें [event_name : संगीत कार्यक्रम] में जाने के लिए" + }, + { + "text": "क्या मेरे पास [timeofday : सुबह] की [event_name : उड़ान] के लिए अलार्म सेट है" + }, + { + "text": "क्या [place_name : बाराबंकी] में [weather_descriptor : बारिश हो रही है]" + }, + { + "text": "क्या [date : आज] [weather_descriptor : बारिश] होगी" + }, + { + "text": "[place_name : दुनिया] में क्या हो रहा है" + }, + { + "text": "[place_name : भोपाल] में क्या हो रहा है" + }, + { + "text": "मुझे [media_type : आज तक] से कुछ समाचार दिखाओ" + }, + { + "text": "olly मुझे [media_type : आज तक] से कुछ समाचार दिखाओ" + } +] \ No newline at end of file diff --git a/tests/data/ru_ner/test.json b/tests/data/ru_ner/test.json new 
file mode 100644 index 0000000..f13c91d --- /dev/null +++ b/tests/data/ru_ner/test.json @@ -0,0 +1,62 @@ +[ + { + "text": "разбуди меня в [time : пять утра] на [date : этой неделе]" + }, + { + "text": "[color_type : розовый] это то что нам надо" + }, + { + "text": "olly выключи свет в [house_place : спальне]" + }, + { + "text": "пропылесось в [house_place : прихожей]" + }, + { + "text": "сколько времени в [place_name : австралии]" + }, + { + "text": "olly скажи мне время по [time_zone : плюс пять по всемирному времени]" + }, + { + "text": "отмени мой будильник на [time : семь утра]" + }, + { + "text": "я люблю песни [artist_name : леонида агутина]" + }, + { + "text": "скажи мне [date : сегодняшнее] число" + }, + { + "text": "какая [date : сегодня] дата" + }, + { + "text": "можешь заказать [food_type : суши] на [meal_type : ужин] [timeofday : сегодня вечером]" + }, + { + "text": "какой прогноз на [time : эту неделю]" + }, + { + "text": "какая будет погода [date : на этой неделе]" + }, + { + "text": "расскажи о погоде [date : на эту неделю]" + }, + { + "text": "закажи [order_type : на вынос]" + }, + { + "text": "у них можно заказать [order_type : на вынос]" + }, + { + "text": "у них есть [order_type : доставка]" + }, + { + "text": "[order_type : доставляют] ли они домой" + }, + { + "text": "оля [order_type : доставляют] ли они на дом" + }, + { + "text": "установить подъём в [time : семь утра] в [date : четверг]" + } +] \ No newline at end of file diff --git a/tests/data/ru_ner/train.json b/tests/data/ru_ner/train.json new file mode 100644 index 0000000..9a13898 --- /dev/null +++ b/tests/data/ru_ner/train.json @@ -0,0 +1,302 @@ +[ + { + "text": "разбуди меня в [time : девять утра] в [date : пятницу]" + }, + { + "text": "поставь будильник [time : на два часа вперед]" + }, + { + "text": "олли остановись на [time : десять секунд]" + }, + { + "text": "остановись на [time : десять секунд]" + }, + { + "text": "сделай освещение здесь чуть более [color_type : тёплым]" + }, + 
{ + "text": "пожалуйста сделай свет [color_type : подходящий для чтения]" + }, + { + "text": "выключи свет в [house_place : ванной]" + }, + { + "text": "olly приглуши светильники в [house_place : холле]" + }, + { + "text": "выключи свет в [house_place : спальне]" + }, + { + "text": "установи светильники [change_amount : на двадцать процентов]" + }, + { + "text": "olly установи уровень света на [change_amount : двадцать процентов]" + }, + { + "text": "olly приглуши свет на [house_place : кухне]" + }, + { + "text": "приглуши свет на [house_place : кухне]" + }, + { + "text": "olly начни уборку в [house_place : квартире]" + }, + { + "text": "пропылесось [house_place : весь дом]" + }, + { + "text": "пропылесось [house_place : дом] olly" + }, + { + "text": "пропылесосить [house_place : ковры] вокруг" + }, + { + "text": "хочу услышать песню [artist_name : светланы лободы] снова" + }, + { + "text": "я хочу воспроизвести ту [media_type : музыку] снова" + }, + { + "text": "скажи мне время в [place_name : москве]" + }, + { + "text": "скажи мне какое [time_zone : среднее время по гринвичу плюс пять]" + }, + { + "text": "olly назови самые популярные сервисы [order_type : доставки] [food_type : китайской] еды" + }, + { + "text": "самые популярные варианты [order_type : доставки] [food_type : китайской] еды" + }, + { + "text": "алиса [order_type : доставка] [food_type : китайской] еды с самым высоким рейтингом" + }, + { + "text": "я хочу немного [food_type : карри] с собой рекомендации" + }, + { + "text": "я хочу немного [food_type : карри] есть какие нибудь предложения olly" + }, + { + "text": "найди мои [food_type : тайские] [order_type : блюда на вынос] в [place_name : панинаро]" + }, + { + "text": "останови будильник на [time : семь утра]" + }, + { + "text": "что происходит [date : сегодня] в [news_topic : футболе]" + }, + { + "text": "поставь пожалуйста [artist_name : кравц] [song_name : обнуляй]" + }, + { + "text": "я люблю музыку в стиле [music_genre : рок]" + }, + { + 
"text": "моя любимая музыкальная группа это [artist_name : чайф]" + }, + { + "text": "включи свет [change_amount : на максимум]" + }, + { + "text": "эй включи [device_type : робот пылесос]" + }, + { + "text": "включи [device_type : робот пылесос]" + }, + { + "text": "пожалуйста закажи немного [food_type : суши] на [meal_type : ужин]" + }, + { + "text": "эй я хочу чтобы ты заказал [food_type : бургер]" + }, + { + "text": "можно заказать в [business_name : гинзе] [meal_type : ужин] [order_type : на вынос]" + }, + { + "text": "в [business_name : панинаро] можно делать [order_type : заказ на вынос]" + }, + { + "text": "установить будильник на [time : двенадцать]" + }, + { + "text": "установить будильник который прозвенит [time : через сорок минут]" + }, + { + "text": "поставь будильник на [time : восемь] [general_frequency : каждый будний день]" + }, + { + "text": "[weather_descriptor : идёт дождь]" + }, + { + "text": "там собирается [weather_descriptor : дождь]" + }, + { + "text": "там сейчас [weather_descriptor : идёт снег]" + }, + { + "text": "какая погода на [date : этой неделе]" + }, + { + "text": "расскажи новости [media_type : russia today]" + }, + { + "text": "какие новости по [media_type : b b c] news" + }, + { + "text": "какие последние новости в [media_type : яндекс новостях]" + }, + { + "text": "играй [artist_name : арию]" + }, + { + "text": "поставь [artist_name : короля и шута]" + }, + { + "text": "[player_setting : перемешай] этот плейлист" + }, + { + "text": "расскажи мне [joke_type : хорошую] шутку" + }, + { + "text": "расскажи о [date : сегодняшнем дне]" + }, + { + "text": "закажи [food_type : пиццу]" + }, + { + "text": "закажи мне [food_type : шаурму] из [business_name : кулинара]" + }, + { + "text": "сколько еще будет готовиться мой заказ [order_type : на вынос]" + }, + { + "text": "статус [order_type : доставки] в [business_name : domino's]" + }, + { + "text": "включи мой [music_genre : джаз] плейлист" + }, + { + "text": "включи мой [music_genre : 
джаз] плейлист" + }, + { + "text": "мне нравится [music_genre : джаз]" + }, + { + "text": "ты можешь сыграть немного [music_genre : джаза]" + }, + { + "text": "включи [player_setting : перетасовку]" + }, + { + "text": "установи яркость на [change_amount : пятьдесят процентов]" + }, + { + "text": "установить будильник на [time : десять утра]" + }, + { + "text": "расскажи о последних [news_topic : технологических] новостях" + }, + { + "text": "расскажи мне последние новости о [news_topic : технологиях]" + }, + { + "text": "сейчас [weather_descriptor : идёт дождь]" + }, + { + "text": "включи мой плейлист [artist_name : полины гагариной]" + }, + { + "text": "включи [artist_name : ивана дорна] из моего плейлиста" + }, + { + "text": "воспроизвести [song_name : последнюю песню] из моего любимого плейлиста" + }, + { + "text": "включи [device_type : розетку]" + }, + { + "text": "включи [device_type : розетку]" + }, + { + "text": "включи мою [device_type : розетку]" + }, + { + "text": "в [food_type : dominoes] можно взять [order_type : на вынос]" + }, + { + "text": "можно ли в моей любимой [food_type : пицце] заказать [order_type : на вынос]" + }, + { + "text": "могу я заказать [order_type : на вынос] из [food_type : испанского] заведения" + }, + { + "text": "какая погода в [place_name : пензе]" + }, + { + "text": "расскажи о погоде в [place_name : шадринске]" + }, + { + "text": "пожалуйста сделай свет [color_type : удобным для чтения]" + }, + { + "text": "сделай освещение [color_type : подходящим для просмотра]" + }, + { + "text": "сделай [house_place : комнату] светлее" + }, + { + "text": "какое [date : сегодня] число" + }, + { + "text": "какой день недели [date : сегодня]" + }, + { + "text": "на улице [weather_descriptor : идёт дождь] алиса" + }, + { + "text": "измени свет на [color_type : зеленый]" + }, + { + "text": "вымой [house_place : пол] пожалуйста" + }, + { + "text": "включи [device_type : розетку для рисоварки]" + }, + { + "text": "выключи [device_type : розетку 
рисоварки]" + }, + { + "text": "можешь помочь мне заказать немного [food_type : суши] в [business_name : достаевском]" + }, + { + "text": "поставить будильник на [time : четыре] [timeofday : после полудня]" + }, + { + "text": "olly уведоми меня в [time : три часа дня] пойти на [event_name : концерт]" + }, + { + "text": "уведоми меня в [time : три часа дня] пойти на [event_name : концерт]" + }, + { + "text": "имею ли я будильник на [timeofday : утренний] [event_name : рейс]" + }, + { + "text": "сейчас в [place_name : казани] [weather_descriptor : идёт дождь]" + }, + { + "text": "[date : сегодня] будет [weather_descriptor : дождь]" + }, + { + "text": "что происходит в [place_name : мире]" + }, + { + "text": "что происходит в [place_name : твери]" + }, + { + "text": "покажи мне несколько новостей [media_type : т.а.с.с.]" + }, + { + "text": "olly покажи какие нибудь новости с [media_type : b. b. c.]" + } +] \ No newline at end of file diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py new file mode 100644 index 0000000..6279773 --- /dev/null +++ b/tests/test_tokenizer.py @@ -0,0 +1,66 @@ +import json +from pathlib import Path + +import pytest + +from open_autonlu.data.ner_data_provider import SimpleNerDataProvider +from open_autonlu.data.utils import tokenize_with_offsets + +_DATA_DIR = Path(__file__).parent / "data" +_provider = SimpleNerDataProvider(train_path="unused.json") + + +def _load_ner_texts() -> list[tuple[str, str]]: + out: list[tuple[str, str]] = [] + for path in sorted(_DATA_DIR.glob("*_ner/*.json")): + lang = path.parent.name.removesuffix("_ner") + split = path.stem + with open(path, encoding="utf-8") as f: + for i, rec in enumerate(json.load(f)): + if isinstance(rec, dict): + parsed = _provider._parse_record(rec) + out.append((f"{lang}_{split}_{i}", parsed["text"])) + return out + + +_NER_TEXTS = _load_ner_texts() + + +# We assert: +# 1. text[tok.start:tok.stop] == tok.text, +# 2. no overlapping tokens, +# 3. 
every non-whitespace character is in exactly one token. + + +class TestTokenizeBasicContract: + @pytest.mark.parametrize( + "text", + [""] + [t for _, t in _NER_TEXTS], + ids=["empty"] + [tid for tid, _ in _NER_TEXTS], + ) + def test_offset_consistency(self, text: str): + tokens = tokenize_with_offsets(text) + for tok in tokens: + assert text[tok.start : tok.stop] == tok.text + + @pytest.mark.parametrize( + "text", [t for _, t in _NER_TEXTS], ids=[tid for tid, _ in _NER_TEXTS] + ) + def test_no_overlap(self, text: str): + tokens = tokenize_with_offsets(text) + for a, b in zip(tokens, tokens[1:]): + assert a.stop <= b.start + + @pytest.mark.parametrize( + "text", [t for _, t in _NER_TEXTS], ids=[tid for tid, _ in _NER_TEXTS] + ) + def test_full_coverage(self, text: str): + tokens = tokenize_with_offsets(text) + for tok in tokens: + assert 0 <= tok.start <= tok.stop <= len(text) + covered = set() + for tok in tokens: + covered.update(range(tok.start, tok.stop)) + for i, c in enumerate(text): + if not c.isspace(): + assert i in covered, f"Non-whitespace at {i!r} not in any token" diff --git a/uv.lock b/uv.lock index 3e90f78..0dc338d 100644 --- a/uv.lock +++ b/uv.lock @@ -137,15 +137,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/db/33/ef2f2409450ef6daa61459d5de5c08128e7d3edb773fefd0a324d1310238/altair-6.0.0-py3-none-any.whl", hash = "sha256:09ae95b53d5fe5b16987dccc785a7af8588f2dca50de1e7a156efa8a461515f8", size = 795410, upload-time = "2025-11-12T08:59:09.804Z" }, ] -[[package]] -name = "annotated-doc" -version = "0.0.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288, upload-time = "2025-11-10T22:07:42.062Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303, upload-time = "2025-11-10T22:07:40.673Z" }, -] - [[package]] name = "annotated-types" version = "0.7.0" @@ -235,24 +226,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" }, ] -[[package]] -name = "blis" -version = "1.3.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d0/d0/d8cc8c9a4488a787e7fa430f6055e5bd1ddb22c340a751d9e901b82e2efe/blis-1.3.3.tar.gz", hash = "sha256:034d4560ff3cc43e8aa37e188451b0440e3261d989bb8a42ceee865607715ecd", size = 2644873, upload-time = "2025-11-17T12:28:30.511Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/16/d1/429cf0cf693d4c7dc2efed969bd474e315aab636e4a95f66c4ed7264912d/blis-1.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2a1c74e100665f8e918ebdbae2794576adf1f691680b5cdb8b29578432f623ef", size = 6929663, upload-time = "2025-11-17T12:27:44.482Z" }, - { url = "https://files.pythonhosted.org/packages/11/69/363c8df8d98b3cc97be19aad6aabb2c9c53f372490d79316bdee92d476e7/blis-1.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3f6c595185176ce021316263e1a1d636a3425b6c48366c1fd712d08d0b71849a", size = 1230939, upload-time = "2025-11-17T12:27:46.19Z" }, - { url = "https://files.pythonhosted.org/packages/96/2a/fbf65d906d823d839076c5150a6f8eb5ecbc5f9135e0b6510609bda1e6b7/blis-1.3.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d734b19fba0be7944f272dfa7b443b37c61f9476d9ab054a9ac53555ceadd2e0", 
size = 2818835, upload-time = "2025-11-17T12:27:48.167Z" }, - { url = "https://files.pythonhosted.org/packages/d5/ad/58deaa3ad856dd3cc96493e40ffd2ed043d18d4d304f85a65cde1ccbf644/blis-1.3.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ef6d6e2b599a3a2788eb6d9b443533961265aa4ec49d574ed4bb846e548dcdb", size = 11366550, upload-time = "2025-11-17T12:27:49.958Z" }, - { url = "https://files.pythonhosted.org/packages/78/82/816a7adfe1f7acc8151f01ec86ef64467a3c833932d8f19f8e06613b8a4e/blis-1.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8c888438ae99c500422d50698e3028b65caa8ebb44e24204d87fda2df64058f7", size = 3023686, upload-time = "2025-11-17T12:27:52.062Z" }, - { url = "https://files.pythonhosted.org/packages/1e/e2/0e93b865f648b5519360846669a35f28ee8f4e1d93d054f6850d8afbabde/blis-1.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8177879fd3590b5eecdd377f9deafb5dc8af6d684f065bd01553302fb3fcf9a7", size = 14250939, upload-time = "2025-11-17T12:27:53.847Z" }, - { url = "https://files.pythonhosted.org/packages/20/07/fb43edc2ff0a6a367e4a94fc39eb3b85aa1e55e24cc857af2db145ce9f0d/blis-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:f20f7ad69aaffd1ce14fe77de557b6df9b61e0c9e582f75a843715d836b5c8af", size = 6192759, upload-time = "2025-11-17T12:27:56.176Z" }, -] - [[package]] name = "cachetools" version = "6.2.6" @@ -262,15 +235,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/90/45/f458fa2c388e79dd9d8b9b0c99f1d31b568f27388f2fdba7bb66bbc0c6ed/cachetools-6.2.6-py3-none-any.whl", hash = "sha256:8c9717235b3c651603fff0076db52d6acbfd1b338b8ed50256092f7ce9c85bda", size = 11668, upload-time = "2026-01-27T20:32:58.527Z" }, ] -[[package]] -name = "catalogue" -version = "2.0.10" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/38/b4/244d58127e1cdf04cf2dc7d9566f0d24ef01d5ce21811bab088ecc62b5ea/catalogue-2.0.10.tar.gz", hash = 
"sha256:4f56daa940913d3f09d589c191c74e5a6d51762b3a9e37dd53b7437afd6cda15", size = 19561, upload-time = "2023-09-25T06:29:24.962Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9e/96/d32b941a501ab566a16358d68b6eb4e4acc373fab3c3c4d7d9e649f7b4bb/catalogue-2.0.10-py3-none-any.whl", hash = "sha256:58c2de0020aa90f4a2da7dfad161bf7b3b054c86a5f09fcedc0b2b740c109a9f", size = 17325, upload-time = "2023-09-25T06:29:23.337Z" }, -] - [[package]] name = "certifi" version = "2026.2.25" @@ -349,15 +313,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, ] -[[package]] -name = "cloudpathlib" -version = "0.23.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f4/18/2ac35d6b3015a0c74e923d94fc69baf8307f7c3233de015d69f99e17afa8/cloudpathlib-0.23.0.tar.gz", hash = "sha256:eb38a34c6b8a048ecfd2b2f60917f7cbad4a105b7c979196450c2f541f4d6b4b", size = 53126, upload-time = "2025-10-07T22:47:56.278Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ae/8a/c4bb04426d608be4a3171efa2e233d2c59a5c8937850c10d098e126df18e/cloudpathlib-0.23.0-py3-none-any.whl", hash = "sha256:8520b3b01468fee77de37ab5d50b1b524ea6b4a8731c35d1b7407ac0cd716002", size = 62755, upload-time = "2025-10-07T22:47:54.905Z" }, -] - [[package]] name = "cloudpickle" version = "3.1.2" @@ -409,19 +364,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417", size = 7294, upload-time = "2025-07-25T14:02:02.896Z" }, ] -[[package]] -name = "confection" -version = "0.1.5" -source = { registry = 
"https://pypi.org/simple" } -dependencies = [ - { name = "pydantic" }, - { name = "srsly" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/51/d3/57c6631159a1b48d273b40865c315cf51f89df7a9d1101094ef12e3a37c2/confection-0.1.5.tar.gz", hash = "sha256:8e72dd3ca6bd4f48913cd220f10b8275978e740411654b6e8ca6d7008c590f0e", size = 38924, upload-time = "2024-05-31T16:17:01.559Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0c/00/3106b1854b45bd0474ced037dfe6b73b90fe68a68968cef47c23de3d43d2/confection-0.1.5-py3-none-any.whl", hash = "sha256:e29d3c3f8eac06b3f77eb9dfb4bf2fc6bcc9622a98ca00a698e3d019c6430b14", size = 35451, upload-time = "2024-05-31T16:16:59.075Z" }, -] - [[package]] name = "contourpy" version = "1.3.3" @@ -496,22 +438,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, ] -[[package]] -name = "cymem" -version = "2.0.13" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c0/8f/2f0fbb32535c3731b7c2974c569fb9325e0a38ed5565a08e1139a3b71e82/cymem-2.0.13.tar.gz", hash = "sha256:1c91a92ae8c7104275ac26bd4d29b08ccd3e7faff5893d3858cb6fadf1bc1588", size = 12320, upload-time = "2025-11-14T14:58:36.902Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c9/52/478a2911ab5028cb710b4900d64aceba6f4f882fcb13fd8d40a456a1b6dc/cymem-2.0.13-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e8afbc5162a0fe14b6463e1c4e45248a1b2fe2cbcecc8a5b9e511117080da0eb", size = 43745, upload-time = "2025-11-14T14:57:32.52Z" }, - { url = "https://files.pythonhosted.org/packages/f9/71/f0f8adee945524774b16af326bd314a14a478ed369a728a22834e6785a18/cymem-2.0.13-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:c9251d889348fe79a75e9b3e4d1b5fa651fca8a64500820685d73a3acc21b6a8", size = 42927, upload-time = "2025-11-14T14:57:33.827Z" }, - { url = "https://files.pythonhosted.org/packages/62/6d/159780fe162ff715d62b809246e5fc20901cef87ca28b67d255a8d741861/cymem-2.0.13-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:742fc19764467a49ed22e56a4d2134c262d73a6c635409584ae3bf9afa092c33", size = 258346, upload-time = "2025-11-14T14:57:34.917Z" }, - { url = "https://files.pythonhosted.org/packages/eb/12/678d16f7aa1996f947bf17b8cfb917ea9c9674ef5e2bd3690c04123d5680/cymem-2.0.13-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f190a92fe46197ee64d32560eb121c2809bb843341733227f51538ce77b3410d", size = 260843, upload-time = "2025-11-14T14:57:36.503Z" }, - { url = "https://files.pythonhosted.org/packages/31/5d/0dd8c167c08cd85e70d274b7235cfe1e31b3cebc99221178eaf4bbb95c6f/cymem-2.0.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d670329ee8dbbbf241b7c08069fe3f1d3a1a3e2d69c7d05ea008a7010d826298", size = 254607, upload-time = "2025-11-14T14:57:38.036Z" }, - { url = "https://files.pythonhosted.org/packages/b7/c9/d6514a412a1160aa65db539836b3d47f9b59f6675f294ec34ae32f867c82/cymem-2.0.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a84ba3178d9128b9ffb52ce81ebab456e9fe959125b51109f5b73ebdfc6b60d6", size = 262421, upload-time = "2025-11-14T14:57:39.265Z" }, - { url = "https://files.pythonhosted.org/packages/dd/fe/3ee37d02ca4040f2fb22d34eb415198f955862b5dd47eee01df4c8f5454c/cymem-2.0.13-cp312-cp312-win_amd64.whl", hash = "sha256:2ff1c41fd59b789579fdace78aa587c5fc091991fa59458c382b116fc36e30dc", size = 40176, upload-time = "2025-11-14T14:57:40.706Z" }, - { url = "https://files.pythonhosted.org/packages/94/fb/1b681635bfd5f2274d0caa8f934b58435db6c091b97f5593738065ddb786/cymem-2.0.13-cp312-cp312-win_arm64.whl", hash = "sha256:6bbd701338df7bf408648191dff52472a9b334f71bcd31a21a41d83821050f67", size = 35959, upload-time = 
"2025-11-14T14:57:41.682Z" }, -] - [[package]] name = "datasets" version = "2.19.1" @@ -1208,18 +1134,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/fb/99f81ac72ae23375f22b7afdb7642aba97c00a713c217124420147681a2f/mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59", size = 78509, upload-time = "2025-04-10T12:50:53.297Z" }, ] -[[package]] -name = "markdown-it-py" -version = "4.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mdurl" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, -] - [[package]] name = "markupsafe" version = "3.0.3" @@ -1288,15 +1202,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d7/9c/f8c498687901bf5c39e05015ea17e689d7f5e68b19a0d5e0c22edb309997/mdpd-0.2.1-py3-none-any.whl", hash = "sha256:7735776f1fa96ac318e89e536aa0e8a9a896163e042d57b1e1c43298d6f6fc26", size = 3449, upload-time = "2023-10-15T08:04:37.099Z" }, ] -[[package]] -name = "mdurl" -version = "0.1.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, -] - [[package]] name = "ml-dtypes" version = "0.5.4" @@ -1365,22 +1270,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351, upload-time = "2024-01-28T18:52:31.981Z" }, ] -[[package]] -name = "murmurhash" -version = "1.0.15" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/23/2e/88c147931ea9725d634840d538622e94122bceaf346233349b7b5c62964b/murmurhash-1.0.15.tar.gz", hash = "sha256:58e2b27b7847f9e2a6edf10b47a8c8dd70a4705f45dccb7bf76aeadacf56ba01", size = 13291, upload-time = "2025-11-14T09:51:15.272Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b6/46/be8522d3456fdccf1b8b049c6d82e7a3c1114c4fc2cfe14b04cba4b3e701/murmurhash-1.0.15-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d37e3ae44746bca80b1a917c2ea625cf216913564ed43f69d2888e5df97db0cb", size = 27884, upload-time = "2025-11-14T09:50:13.133Z" }, - { url = "https://files.pythonhosted.org/packages/ed/cc/630449bf4f6178d7daf948ce46ad00b25d279065fc30abd8d706be3d87e0/murmurhash-1.0.15-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0861cb11039409eaf46878456b7d985ef17b6b484103a6fc367b2ecec846891d", size = 27855, upload-time = "2025-11-14T09:50:14.859Z" }, - { url = "https://files.pythonhosted.org/packages/ff/30/ea8f601a9bf44db99468696efd59eb9cff1157cd55cb586d67116697583f/murmurhash-1.0.15-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = 
"sha256:5a301decfaccfec70fe55cb01dde2a012c3014a874542eaa7cc73477bb749616", size = 134088, upload-time = "2025-11-14T09:50:15.958Z" }, - { url = "https://files.pythonhosted.org/packages/c9/de/c40ce8c0877d406691e735b8d6e9c815f36a82b499d358313db5dbe219d7/murmurhash-1.0.15-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:32c6fde7bd7e9407003370a07b5f4addacabe1556ad3dc2cac246b7a2bba3400", size = 133978, upload-time = "2025-11-14T09:50:17.572Z" }, - { url = "https://files.pythonhosted.org/packages/47/84/bd49963ecd84ebab2fe66595e2d1ed41d5e8b5153af5dc930f0bd827007c/murmurhash-1.0.15-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5d8b43a7011540dc3c7ce66f2134df9732e2bc3bbb4a35f6458bc755e48bde26", size = 132956, upload-time = "2025-11-14T09:50:18.742Z" }, - { url = "https://files.pythonhosted.org/packages/4f/7c/2530769c545074417c862583f05f4245644599f1e9ff619b3dfe2969aafc/murmurhash-1.0.15-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:43bf4541892ecd95963fcd307bf1c575fc0fee1682f41c93007adee71ca2bb40", size = 134184, upload-time = "2025-11-14T09:50:19.941Z" }, - { url = "https://files.pythonhosted.org/packages/84/a4/b249b042f5afe34d14ada2dc4afc777e883c15863296756179652e081c44/murmurhash-1.0.15-cp312-cp312-win_amd64.whl", hash = "sha256:f4ac15a2089dc42e6eb0966622d42d2521590a12c92480aafecf34c085302cca", size = 25647, upload-time = "2025-11-14T09:50:21.049Z" }, - { url = "https://files.pythonhosted.org/packages/13/bf/028179259aebc18fd4ba5cae2601d1d47517427a537ab44336446431a215/murmurhash-1.0.15-cp312-cp312-win_arm64.whl", hash = "sha256:4a70ca4ae19e600d9be3da64d00710e79dde388a4d162f22078d64844d0ebdda", size = 23338, upload-time = "2025-11-14T09:50:22.359Z" }, -] - [[package]] name = "mypy" version = "1.19.1" @@ -1745,7 +1634,6 @@ dependencies = [ { name = "setuptools" }, { name = "skl2onnx" }, { name = "sklearn-pandas" }, - { name = "spacy" }, { name = "streamlit" }, { name = "tornado" }, { name = "transformers" }, @@ -1827,7 +1715,6 @@ 
requires-dist = [ { name = "setuptools", specifier = "==78.1.1" }, { name = "skl2onnx", specifier = ">=1.17.0" }, { name = "sklearn-pandas", specifier = "==2.2.0" }, - { name = "spacy", specifier = ">=3.0.0" }, { name = "streamlit", specifier = "==1.54.0" }, { name = "torch", marker = "extra == 'cpu'", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "open-autonlu", extra = "cpu" } }, { name = "torch", marker = "extra == 'cuda'", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu124", conflict = { package = "open-autonlu", extra = "cuda" } }, @@ -2105,26 +1992,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/29/db1d855a661c02dbde5cab3057969133fcc62e7a0c6393e48fe9d0e81679/pre_commit_hooks-5.0.0-py2.py3-none-any.whl", hash = "sha256:8d71cfb582c5c314a5498d94e0104b6567a8b93fb35903ea845c491f4e290a7a", size = 41245, upload-time = "2024-10-05T18:43:09.901Z" }, ] -[[package]] -name = "preshed" -version = "3.0.12" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cymem" }, - { name = "murmurhash" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/bf/34/eb4f5f0f678e152a96e826da867d2f41c4b18a2d589e40e1dd3347219e91/preshed-3.0.12.tar.gz", hash = "sha256:b73f9a8b54ee1d44529cc6018356896cff93d48f755f29c134734d9371c0d685", size = 15027, upload-time = "2025-11-17T13:00:33.621Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4b/f7/ff3aca937eeaee19c52c45ddf92979546e52ed0686e58be4bc09c47e7d88/preshed-3.0.12-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2779861f5d69480493519ed123a622a13012d1182126779036b99d9d989bf7e9", size = 129958, upload-time = "2025-11-17T12:59:33.391Z" }, - { url = "https://files.pythonhosted.org/packages/80/24/fd654a9c0f5f3ed1a9b1d8a392f063ae9ca29ad0b462f0732ae0147f7cee/preshed-3.0.12-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ffe1fd7d92f51ed34383e20d8b734780c814ca869cfdb7e07f2d31651f90cdf4", size = 124550, 
upload-time = "2025-11-17T12:59:34.688Z" }, - { url = "https://files.pythonhosted.org/packages/71/49/8271c7f680696f4b0880f44357d2a903d649cb9f6e60a1efc97a203104df/preshed-3.0.12-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:91893404858502cc4e856d338fef3d2a4a552135f79a1041c24eb919817c19db", size = 874987, upload-time = "2025-11-17T12:59:36.062Z" }, - { url = "https://files.pythonhosted.org/packages/a3/a5/ca200187ca1632f1e2c458b72f1bd100fa8b55deecd5d72e1e4ebf09e98c/preshed-3.0.12-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9e06e8f2ba52f183eb9817a616cdebe84a211bb859a2ffbc23f3295d0b189638", size = 866499, upload-time = "2025-11-17T12:59:37.586Z" }, - { url = "https://files.pythonhosted.org/packages/87/a1/943b61f850c44899910c21996cb542d0ef5931744c6d492fdfdd8457e693/preshed-3.0.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bbe8b8a2d4f9af14e8a39ecca524b9de6defc91d8abcc95eb28f42da1c23272c", size = 878064, upload-time = "2025-11-17T12:59:39.651Z" }, - { url = "https://files.pythonhosted.org/packages/3e/75/d7fff7f1fa3763619aa85d6ba70493a5d9c6e6ea7958a6e8c9d3e6e88bbe/preshed-3.0.12-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5d0aaac9c5862f5471fddd0c931dc64d3af2efc5fe3eb48b50765adb571243b9", size = 900540, upload-time = "2025-11-17T12:59:41.384Z" }, - { url = "https://files.pythonhosted.org/packages/e4/12/a2285b78bd097a1e53fb90a1743bc8ce0d35e5b65b6853f3b3c47da398ca/preshed-3.0.12-cp312-cp312-win_amd64.whl", hash = "sha256:0eb8d411afcb1e3b12a0602fb6a0e33140342a732a795251a0ce452aba401dc0", size = 118298, upload-time = "2025-11-17T12:59:42.65Z" }, - { url = "https://files.pythonhosted.org/packages/0b/34/4e8443fe99206a2fcfc63659969a8f8c8ab184836533594a519f3899b1ad/preshed-3.0.12-cp312-cp312-win_arm64.whl", hash = "sha256:dcd3d12903c9f720a39a5c5f1339f7f46e3ab71279fb7a39776768fb840b6077", size = 104746, upload-time = "2025-11-17T12:59:43.934Z" }, -] - 
[[package]] name = "prompt-toolkit" version = "3.0.52" @@ -2559,19 +2426,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, ] -[[package]] -name = "rich" -version = "14.3.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markdown-it-py" }, - { name = "pygments" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" }, -] - [[package]] name = "roman-numerals" version = "4.1.0" @@ -2778,15 +2632,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/90/99/158ad0609729111163fc1f674a5a42f2605371a4cf036d0441070e2f7455/setuptools-78.1.1-py3-none-any.whl", hash = "sha256:c3a9c4211ff4c309edb8b8c4f1cbfa7ae324c4ba9f91ff254e3d305b9fd54561", size = 1256462, upload-time = "2025-04-19T18:23:34.525Z" }, ] -[[package]] -name = "shellingham" -version = "1.5.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, -] - [[package]] name = "six" version = "1.17.0" @@ -2824,18 +2669,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/30/71/ccd5222f731993dfc1a6d9e766a507f1859bda4930b9548e54c11c876baf/sklearn_pandas-2.2.0-py2.py3-none-any.whl", hash = "sha256:9383e577be14448ee7f7d5f42e78b89394c1f71be43377d2b1bcbd876a94f629", size = 10980, upload-time = "2021-05-08T08:14:28.141Z" }, ] -[[package]] -name = "smart-open" -version = "7.5.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "wrapt" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e8/be/a66598b305763861a9ab15ff0f2fbc44e47b1ce7a776797337a4eef37c66/smart_open-7.5.1.tar.gz", hash = "sha256:3f08e16827c4733699e6b2cc40328a3568f900cb12ad9a3ad233ba6c872d9fe7", size = 54034, upload-time = "2026-02-23T11:01:28.979Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5e/ea/dcdecd68acebb49d3fd560473a43499b1635076f7f1ae8641c060fe7ce74/smart_open-7.5.1-py3-none-any.whl", hash = "sha256:3e07cbbd9c8a908bcb8e25d48becf1a5cbb4886fa975e9f34c672ed171df2318", size = 64108, upload-time = "2026-02-23T11:01:27.429Z" }, -] - [[package]] name = "smmap" version = "5.0.2" @@ -2863,60 +2696,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c8/78/3565d011c61f5a43488987ee32b6f3f656e7f107ac2782dd57bdd7d91d9a/snowballstemmer-3.0.1-py3-none-any.whl", hash = "sha256:6cd7b3897da8d6c9ffb968a6781fa6532dce9c3618a4b127d920dab764a19064", size = 103274, upload-time = "2025-05-09T16:34:50.371Z" }, ] -[[package]] -name = "spacy" -version = "3.8.11" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "catalogue" }, - { name = "cymem" }, - { name = "jinja2" }, - { name = 
"murmurhash" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "preshed" }, - { name = "pydantic" }, - { name = "requests" }, - { name = "setuptools" }, - { name = "spacy-legacy" }, - { name = "spacy-loggers" }, - { name = "srsly" }, - { name = "thinc" }, - { name = "tqdm" }, - { name = "typer-slim" }, - { name = "wasabi" }, - { name = "weasel" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/59/9f/424244b0e2656afc9ff82fb7a96931a47397bfce5ba382213827b198312a/spacy-3.8.11.tar.gz", hash = "sha256:54e1e87b74a2f9ea807ffd606166bf29ac45e2bd81ff7f608eadc7b05787d90d", size = 1326804, upload-time = "2025-11-17T20:40:03.079Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/51/fb/01eadf4ba70606b3054702dc41fc2ccf7d70fb14514b3cd57f0ff78ebea8/spacy-3.8.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:aa1ee8362074c30098feaaf2dd888c829a1a79c4311eec1b117a0a61f16fa6dd", size = 6073726, upload-time = "2025-11-17T20:39:01.679Z" }, - { url = "https://files.pythonhosted.org/packages/3a/f8/07b03a2997fc2621aaeafae00af50f55522304a7da6926b07027bb6d0709/spacy-3.8.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:75a036d04c2cf11d6cb566c0a689860cc5a7a75b439e8fea1b3a6b673dabf25d", size = 5724702, upload-time = "2025-11-17T20:39:03.486Z" }, - { url = "https://files.pythonhosted.org/packages/13/0c/c4fa0f379dbe3258c305d2e2df3760604a9fcd71b34f8f65c23e43f4cf55/spacy-3.8.11-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cb599d2747d4a59a5f90e8a453c149b13db382a8297925cf126333141dbc4f7", size = 32727774, upload-time = "2025-11-17T20:39:05.894Z" }, - { url = "https://files.pythonhosted.org/packages/ce/8e/6a4ba82bed480211ebdf5341b0f89e7271b454307525ac91b5e447825914/spacy-3.8.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:94632e302ad2fb79dc285bf1e9e4d4a178904d5c67049e0e02b7fb4a77af85c4", size = 33215053, upload-time = "2025-11-17T20:39:08.588Z" }, - { url = 
"https://files.pythonhosted.org/packages/a6/bc/44d863d248e9d7358c76a0aa8b3f196b8698df520650ed8de162e18fbffb/spacy-3.8.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:aeca6cf34009d48cda9fb1bbfb532469e3d643817241a73e367b34ab99a5806f", size = 32074195, upload-time = "2025-11-17T20:39:11.601Z" }, - { url = "https://files.pythonhosted.org/packages/6f/7d/0b115f3f16e1dd2d3f99b0f89497867fc11c41aed94f4b7a4367b4b54136/spacy-3.8.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:368a79b8df925b15d89dccb5e502039446fb2ce93cf3020e092d5b962c3349b9", size = 32996143, upload-time = "2025-11-17T20:39:14.705Z" }, - { url = "https://files.pythonhosted.org/packages/7d/48/7e9581b476df76aaf9ee182888d15322e77c38b0bbbd5e80160ba0bddd4c/spacy-3.8.11-cp312-cp312-win_amd64.whl", hash = "sha256:88d65941a87f58d75afca1785bd64d01183a92f7269dcbcf28bd9d6f6a77d1a7", size = 14217511, upload-time = "2025-11-17T20:39:17.316Z" }, - { url = "https://files.pythonhosted.org/packages/7b/1f/307a16f32f90aa5ee7ad8d29ff8620a57132b80a4c8c536963d46d192e1a/spacy-3.8.11-cp312-cp312-win_arm64.whl", hash = "sha256:97b865d6d3658e2ab103a67d6c8a2d678e193e84a07f40d9938565b669ceee39", size = 13614446, upload-time = "2025-11-17T20:39:19.748Z" }, -] - -[[package]] -name = "spacy-legacy" -version = "3.0.12" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d9/79/91f9d7cc8db5642acad830dcc4b49ba65a7790152832c4eceb305e46d681/spacy-legacy-3.0.12.tar.gz", hash = "sha256:b37d6e0c9b6e1d7ca1cf5bc7152ab64a4c4671f59c85adaf7a3fcb870357a774", size = 23806, upload-time = "2023-01-23T09:04:15.104Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c3/55/12e842c70ff8828e34e543a2c7176dac4da006ca6901c9e8b43efab8bc6b/spacy_legacy-3.0.12-py2.py3-none-any.whl", hash = "sha256:476e3bd0d05f8c339ed60f40986c07387c0a71479245d6d0f4298dbd52cda55f", size = 29971, upload-time = "2023-01-23T09:04:13.45Z" }, -] - -[[package]] -name = "spacy-loggers" -version = "1.0.5" 
-source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/67/3d/926db774c9c98acf66cb4ed7faf6c377746f3e00b84b700d0868b95d0712/spacy-loggers-1.0.5.tar.gz", hash = "sha256:d60b0bdbf915a60e516cc2e653baeff946f0cfc461b452d11a4d5458c6fe5f24", size = 20811, upload-time = "2023-09-11T12:26:52.323Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/33/78/d1a1a026ef3af911159398c939b1509d5c36fe524c7b644f34a5146c4e16/spacy_loggers-1.0.5-py3-none-any.whl", hash = "sha256:196284c9c446cc0cdb944005384270d775fdeaf4f494d8e269466cfa497ef645", size = 22343, upload-time = "2023-09-11T12:26:50.586Z" }, -] - [[package]] name = "sphinx" version = "9.1.0" @@ -3072,24 +2851,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/15/9f/7c378406b592fcf1fc157248607b495a40e3202ba4a6f1372a2ba6447717/sqlalchemy-2.0.47-py3-none-any.whl", hash = "sha256:e2647043599297a1ef10e720cf310846b7f31b6c841fee093d2b09d81215eb93", size = 1940159, upload-time = "2026-02-24T17:15:07.158Z" }, ] -[[package]] -name = "srsly" -version = "2.5.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "catalogue" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/cf/77/5633c4ba65e3421b72b5b4bd93aa328360b351b3a1e5bf3c90eb224668e5/srsly-2.5.2.tar.gz", hash = "sha256:4092bc843c71b7595c6c90a0302a197858c5b9fe43067f62ae6a45bc3baa1c19", size = 492055, upload-time = "2025-11-17T14:11:02.543Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8f/1c/21f658d98d602a559491b7886c7ca30245c2cd8987ff1b7709437c0f74b1/srsly-2.5.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6f92b4f883e6be4ca77f15980b45d394d310f24903e25e1b2c46df783c7edcce", size = 656161, upload-time = "2025-11-17T14:10:03.181Z" }, - { url = "https://files.pythonhosted.org/packages/2f/a2/bc6fd484ed703857043ae9abd6c9aea9152f9480a6961186ee6c1e0c49e8/srsly-2.5.2-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:ac4790a54b00203f1af5495b6b8ac214131139427f30fcf05cf971dde81930eb", size = 653237, upload-time = "2025-11-17T14:10:04.636Z" }, - { url = "https://files.pythonhosted.org/packages/ab/ea/e3895da29a15c8d325e050ad68a0d1238eece1d2648305796adf98dcba66/srsly-2.5.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ce5c6b016050857a7dd365c9dcdd00d96e7ac26317cfcb175db387e403de05bf", size = 1174418, upload-time = "2025-11-17T14:10:05.945Z" }, - { url = "https://files.pythonhosted.org/packages/a6/a5/21996231f53ee97191d0746c3a672ba33a4d86a19ffad85a1c0096c91c5f/srsly-2.5.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:539c6d0016e91277b5e9be31ebed03f03c32580d49c960e4a92c9003baecf69e", size = 1183089, upload-time = "2025-11-17T14:10:07.335Z" }, - { url = "https://files.pythonhosted.org/packages/7b/df/eb17aa8e4a828e8df7aa7dc471295529d9126e6b710f1833ebe0d8568a8e/srsly-2.5.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9f24b2c4f4c29da04083f09158543eb3f8893ba0ac39818693b3b259ee8044f0", size = 1122594, upload-time = "2025-11-17T14:10:08.899Z" }, - { url = "https://files.pythonhosted.org/packages/80/74/1654a80e6c8ec3ee32370ea08a78d3651e0ba1c4d6e6be31c9efdb9a2d10/srsly-2.5.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d34675047460a3f6999e43478f40d9b43917ea1e93a75c41d05bf7648f3e872d", size = 1139594, upload-time = "2025-11-17T14:10:10.286Z" }, - { url = "https://files.pythonhosted.org/packages/73/aa/8393344ca7f0e81965febba07afc5cad68335ed0426408d480b861ab915b/srsly-2.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:81fd133ba3c66c07f0e3a889d2b4c852984d71ea833a665238a9d47d8e051ba5", size = 654750, upload-time = "2025-11-17T14:10:11.637Z" }, -] - [[package]] name = "stack-data" version = "0.6.3" @@ -3154,36 +2915,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d7/c1/eb8f9debc45d3b7918a32ab756658a0904732f75e555402972246b0b8e71/tenacity-9.1.4-py3-none-any.whl", hash = 
"sha256:6095a360c919085f28c6527de529e76a06ad89b23659fa881ae0649b867a9d55", size = 28926, upload-time = "2026-02-07T10:45:32.24Z" }, ] -[[package]] -name = "thinc" -version = "8.3.10" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "blis" }, - { name = "catalogue" }, - { name = "confection" }, - { name = "cymem" }, - { name = "murmurhash" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "preshed" }, - { name = "pydantic" }, - { name = "setuptools" }, - { name = "srsly" }, - { name = "wasabi" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/2f/3a/2d0f0be132b9faaa6d56f04565ae122684273e4bf4eab8dee5f48dc00f68/thinc-8.3.10.tar.gz", hash = "sha256:5a75109f4ee1c968fc055ce651a17cb44b23b000d9e95f04a4d047ab3cb3e34e", size = 194196, upload-time = "2025-11-17T17:21:46.435Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d3/34/ba3b386d92edf50784b60ee34318d47c7f49c198268746ef7851c5bbe8cf/thinc-8.3.10-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:51bc6ef735bdbcab75ab2916731b8f61f94c66add6f9db213d900d3c6a244f95", size = 794509, upload-time = "2025-11-17T17:21:03.21Z" }, - { url = "https://files.pythonhosted.org/packages/07/f3/9f52d18115cd9d8d7b2590d226cb2752d2a5ffec61576b19462b48410184/thinc-8.3.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4f48b4d346915f98e9722c0c50ef911cc16c6790a2b7afebc6e1a2c96a6ce6c6", size = 741084, upload-time = "2025-11-17T17:21:04.568Z" }, - { url = "https://files.pythonhosted.org/packages/ad/9c/129c2b740c4e3d3624b6fb3dec1577ef27cb804bc1647f9bc3e1801ea20c/thinc-8.3.10-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5003f4db2db22cc8d686db8db83509acc3c50f4c55ebdcb2bbfcc1095096f7d2", size = 3846337, upload-time = "2025-11-17T17:21:06.079Z" }, - { url = "https://files.pythonhosted.org/packages/22/d2/738cf188dea8240c2be081c83ea47270fea585eba446171757d2cdb9b675/thinc-8.3.10-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:b12484c3ed0632331fada2c334680dd6bc35972d0717343432dfc701f04a9b4c", size = 3901216, upload-time = "2025-11-17T17:21:07.842Z" }, - { url = "https://files.pythonhosted.org/packages/22/92/32f66eb9b1a29b797bf378a0874615d810d79eefca1d6c736c5ca3f8b918/thinc-8.3.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8677c446d3f9b97a465472c58683b785b25dfcf26c683e3f4e8f8c7c188e4362", size = 4827286, upload-time = "2025-11-17T17:21:09.62Z" }, - { url = "https://files.pythonhosted.org/packages/c4/5f/7ceae1e1f2029efd67ed88e23cd6dc13a5ee647cdc2b35113101b2a62c10/thinc-8.3.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:759c385ac08dcf950238b60b96a28f9c04618861141766928dff4a51b1679b25", size = 5024421, upload-time = "2025-11-17T17:21:11.199Z" }, - { url = "https://files.pythonhosted.org/packages/0b/66/30f9d8d41049b78bc614213d492792fbcfeb1b28642adf661c42110a7ebd/thinc-8.3.10-cp312-cp312-win_amd64.whl", hash = "sha256:bf3f188c3fa1fdcefd547d1f90a1245c29025d6d0e3f71d7fdf21dad210b990c", size = 1718631, upload-time = "2025-11-17T17:21:12.965Z" }, - { url = "https://files.pythonhosted.org/packages/f8/44/32e2a5018a1165a304d25eb9b1c74e5310da19a533a35331e8d824dc6a88/thinc-8.3.10-cp312-cp312-win_arm64.whl", hash = "sha256:234b7e57a6ef4e0260d99f4e8fdc328ed12d0ba9bbd98fdaa567294a17700d1c", size = 1642224, upload-time = "2025-11-17T17:21:14.371Z" }, -] - [[package]] name = "threadpoolctl" version = "3.6.0" @@ -3448,33 +3179,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/06/00/59500052cb1cf8cf5316be93598946bc451f14072c6ff256904428eaf03c/triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c", size = 253159365, upload-time = "2025-01-22T19:13:24.648Z" }, ] -[[package]] -name = "typer" -version = "0.24.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "annotated-doc" }, - { name = "click" }, - { name = "rich" }, - { name = 
"shellingham" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f5/24/cb09efec5cc954f7f9b930bf8279447d24618bb6758d4f6adf2574c41780/typer-0.24.1.tar.gz", hash = "sha256:e39b4732d65fbdcde189ae76cf7cd48aeae72919dea1fdfc16593be016256b45", size = 118613, upload-time = "2026-02-21T16:54:40.609Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4a/91/48db081e7a63bb37284f9fbcefda7c44c277b18b0e13fbc36ea2335b71e6/typer-0.24.1-py3-none-any.whl", hash = "sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e", size = 56085, upload-time = "2026-02-21T16:54:41.616Z" }, -] - -[[package]] -name = "typer-slim" -version = "0.24.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typer" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a7/a7/e6aecc4b4eb59598829a3b5076a93aff291b4fdaa2ded25efc4e1f4d219c/typer_slim-0.24.0.tar.gz", hash = "sha256:f0ed36127183f52ae6ced2ecb2521789995992c521a46083bfcdbb652d22ad34", size = 4776, upload-time = "2026-02-16T22:08:51.2Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/24/5480c20380dfd18cf33d14784096dca45a24eae6102e91d49a718d3b6855/typer_slim-0.24.0-py3-none-any.whl", hash = "sha256:d5d7ee1ee2834d5020c7c616ed5e0d0f29b9a4b1dd283bdebae198ec09778d0e", size = 3394, upload-time = "2026-02-16T22:08:49.92Z" }, -] - [[package]] name = "typing-extensions" version = "4.12.2" @@ -3529,18 +3233,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/29/d1/3f62e4f9577b28c352c11623a03fb916096d5c131303d4861b4914481b6b/virtualenv-21.0.0-py3-none-any.whl", hash = "sha256:d44e70637402c7f4b10f48491c02a6397a3a187152a70cba0b6bc7642d69fb05", size = 5817167, upload-time = "2026-02-25T20:21:05.476Z" }, ] -[[package]] -name = "wasabi" -version = "1.1.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32' or (extra == 'extra-12-open-autonlu-cpu' and extra == 
'extra-12-open-autonlu-cuda')" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ac/f9/054e6e2f1071e963b5e746b48d1e3727470b2a490834d18ad92364929db3/wasabi-1.1.3.tar.gz", hash = "sha256:4bb3008f003809db0c3e28b4daf20906ea871a2bb43f9914197d540f4f2e0878", size = 30391, upload-time = "2024-05-31T16:56:18.99Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/06/7c/34330a89da55610daa5f245ddce5aab81244321101614751e7537f125133/wasabi-1.1.3-py3-none-any.whl", hash = "sha256:f76e16e8f7e79f8c4c8be49b4024ac725713ab10cd7f19350ad18a8e3f71728c", size = 27880, upload-time = "2024-05-31T16:56:16.699Z" }, -] - [[package]] name = "watchdog" version = "6.0.0" @@ -3571,26 +3263,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl", hash = "sha256:1a3a1e510b553315f8e146c54764f4fb6264ffad731b3d78088cdb1478ffbdad", size = 94189, upload-time = "2026-02-06T19:19:39.646Z" }, ] -[[package]] -name = "weasel" -version = "0.4.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cloudpathlib" }, - { name = "confection" }, - { name = "packaging" }, - { name = "pydantic" }, - { name = "requests" }, - { name = "smart-open" }, - { name = "srsly" }, - { name = "typer-slim" }, - { name = "wasabi" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/09/d7/edd9c24e60cf8e5de130aa2e8af3b01521f4d0216c371d01212f580d0d8e/weasel-0.4.3.tar.gz", hash = "sha256:f293d6174398e8f478c78481e00c503ee4b82ea7a3e6d0d6a01e46a6b1396845", size = 38733, upload-time = "2025-11-13T23:52:28.193Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a4/74/a148b41572656904a39dfcfed3f84dd1066014eed94e209223ae8e9d088d/weasel-0.4.3-py3-none-any.whl", hash = "sha256:08f65b5d0dbded4879e08a64882de9b9514753d9eaa4c4e2a576e33666ac12cf", size = 50757, upload-time = "2025-11-13T23:52:26.982Z" }, -] - [[package]] name = "wrapt" version = "2.1.1"