From 1b7f254c4729f06a3fa4070fb1e69afa207374ef Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Fri, 29 Aug 2025 11:45:44 +0000 Subject: [PATCH 1/5] CI: Test on CPython 3.14, PyPy-3.11 + ruff linting --- .github/workflows/python-package.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4ad8f0f..00e75b2 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,7 +15,7 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: [ "3.9", "3.13", "pypy-3.9", "pypy-3.10"] + python-version: [ "3.9", "3.13", "3.14.0-beta.4", "pypy-3.9", "pypy-3.11"] steps: - uses: actions/checkout@v4 @@ -33,9 +33,13 @@ jobs: else uv pip install --system -e ".[dev]" fi + - name: Lint with ruff + run: | + if [ "${{ matrix.python-version }}" == "3.9" ]; then uv pip install --system ruff; fi + if [ "${{ matrix.python-version }}" == "3.9" ]; then ruff check src/reynir_correct; fi - name: Typecheck with mypy run: | - if [ "${{ matrix.python-version }}" == "3.9" ]; then python -m pip install mypy; fi + if [ "${{ matrix.python-version }}" == "3.9" ]; then uv pip install --system mypy; fi if [ "${{ matrix.python-version }}" == "3.9" ]; then mypy --ignore-missing-imports --python-version=3.9 src/reynir_correct; fi - name: Test with pytest run: | From 54b0ed5654ffb8f88302ccc0d86eef121d0820ee Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Fri, 29 Aug 2025 11:46:13 +0000 Subject: [PATCH 2/5] Explicitly support Python 3.14, version bump, ruff linting configuration --- pyproject.toml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5d104db..e18c233 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "reynir-correct" -version = "4.1.0" +version = "4.1.1" description = "Spelling and grammar correction for Icelandic" authors = [{ 
name = "Miðeind ehf.", email = "mideind@mideind.is" }] readme = { file = "README.rst", content-type = "text/x-rst" } @@ -22,6 +22,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "Topic :: Software Development :: Libraries :: Python Modules", @@ -61,11 +62,17 @@ filterwarnings = [ [tool.ruff] line-length = 120 -[tool.black] -line-length = 120 +[tool.ruff.lint] +#select = ["ALL"] # We use default rules for now +extend-select = ["E501"] # Complain about line length +# Ignore specific rules +# (we should aim to have these as few as possible) +ignore = [ + "F405", # 'F405: Name may be undefined, or defined from star imports: typing' + "E731", # 'E731: Do not assign a lambda expression, use a def' +] [tool.isort] # This forces these imports to placed at the top known_future_library = ["__future__", "typing", "typing_extensions"] -profile = "black" line_length = 120 From dc4527b3aa5e22d92441790c29399cf939f073ad Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Fri, 29 Aug 2025 13:27:46 +0000 Subject: [PATCH 3/5] Disabling "3.14.0-beta.4" CI + disabling ruff line length complaints --- .github/workflows/python-package.yml | 2 +- pyproject.toml | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 00e75b2..999ae8d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,7 +15,7 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: [ "3.9", "3.13", "3.14.0-beta.4", "pypy-3.9", "pypy-3.11"] + python-version: [ "3.9", "3.13", "pypy-3.9", "pypy-3.11"] # "3.14.0-beta.4" steps: - uses: actions/checkout@v4 diff --git a/pyproject.toml b/pyproject.toml index 
e18c233..dcbea62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,11 +64,10 @@ line-length = 120 [tool.ruff.lint] #select = ["ALL"] # We use default rules for now -extend-select = ["E501"] # Complain about line length +# extend-select = ["E501"] # Complain about line length # Ignore specific rules # (we should aim to have these as few as possible) ignore = [ - "F405", # 'F405: Name may be undefined, or defined from star imports: typing' "E731", # 'E731: Do not assign a lambda expression, use a def' ] From faebcff77e230add6151c4a053b10789857ab8d6 Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Fri, 29 Aug 2025 13:39:24 +0000 Subject: [PATCH 4/5] Silencing ridiculous mypy complaints in errtokenizer --- src/reynir_correct/errtokenizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/reynir_correct/errtokenizer.py b/src/reynir_correct/errtokenizer.py index fbf52aa..fd65e59 100644 --- a/src/reynir_correct/errtokenizer.py +++ b/src/reynir_correct/errtokenizer.py @@ -372,7 +372,7 @@ def word( def __repr__(self) -> str: return "".format( - TOK.descr[self.kind], self.txt, self.val, self.original + TOK.descr[self.kind], self.txt, self.val, self.original # type: ignore ) __str__ = __repr__ @@ -383,7 +383,7 @@ def concatenate(self, other: Tok, *, separator: str = "", metadata_from_other: b self_txt = self.txt or "" other_txt = other.txt or "" new_txt = self_txt + separator + other_txt - self_original = self.original or "" + self_original = self.original or "" # type: ignore other_original = other.original or "" new_original = self_original + other_original @@ -433,7 +433,7 @@ def copy(self, other: Union[Tok, Sequence[Tok]], coalesce: bool = False) -> bool from another CorrectToken instance""" if isinstance(other, CorrectToken): self._err = other._err - self.original = other.original + self.original = other.original # type: ignore if coalesce and other.error_span > 1: # The original token had an associated error # spanning more 
than one token; now we're creating From 4f7cdfb0dac28ccb5cc3c058be182b0027188f0d Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Fri, 29 Aug 2025 14:48:25 +0000 Subject: [PATCH 5/5] Migrated README from RST to GitHub-flavored Markdown + rm explicit Python 3.14 support in project metadata until Icegrams dependency issue is resolved --- README.md | 374 +++++++++++++++++++++++++++++++++++++++++ README.rst | 442 ------------------------------------------------- pyproject.toml | 1 - 3 files changed, 374 insertions(+), 443 deletions(-) create mode 100644 README.md delete mode 100644 README.rst diff --git a/README.md b/README.md new file mode 100644 index 0000000..4b98a92 --- /dev/null +++ b/README.md @@ -0,0 +1,374 @@ + +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) +[![Python 3.9+](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-390/) +[![PyPI version](https://img.shields.io/pypi/v/reynir-correct)](https://pypi.org/project/reynir-correct/) +[![GitHub release](https://shields.io/github/v/release/mideind/GreynirCorrect?display_name=tag)](https://github.com/mideind/GreynirCorrect/releases) +[![Python package](https://github.com/mideind/GreynirCorrect/actions/workflows/python-package.yml/badge.svg)](https://github.com/mideind/GreynirCorrect/actions?query=workflow%3A%22Python+package%22) + +# GreynirCorrect: Spelling and grammar correction for Icelandic + +## Overview + +**GreynirCorrect** is a Python 3 (>=3.9) package and command line tool for +**checking and correcting spelling and grammar** in Icelandic text. + +GreynirCorrect relies on the [Greynir](https://pypi.org/project/reynir/) package, +by the same authors, to tokenize and parse text. + +GreynirCorrect is documented in detail [here](https://yfirlestur.is/doc/). + +The software has three main modes of operation, described below. 
+ +As a fourth alternative, you can call the JSON REST API +of [Yfirlestur.is](https://yfirlestur.is) +to apply the GreynirCorrect spelling and grammar engine to your text, +as [documented here](https://github.com/mideind/Yfirlestur#https-api). + +### Token-level correction + +GreynirCorrect can tokenize text and return an automatically corrected token stream. +This catches token-level errors, such as spelling errors and erroneous +phrases, but not grammatical errors. Token-level correction is relatively fast. + +### Full grammar analysis + +GreynirCorrect can analyze text grammatically by attempting to parse +it, after token-level correction. The parsing is done according to Greynir's +context-free grammar for Icelandic, augmented with additional production +rules for common grammatical errors. The analysis returns a set of annotations +(errors and suggestions) that apply to spans (consecutive tokens) within +sentences in the resulting token list. Full grammar analysis is considerably +slower than token-level correction. + +### Command-line tool + +GreynirCorrect can be invoked as a command-line tool +to perform token-level correction and, optionally, grammar analysis. +The command is `correct infile.txt outfile.txt`. +The command-line tool is further documented below. + +## Examples + +To perform token-level correction from Python code: + +```python +>>> from reynir_correct import tokenize +>>> g = tokenize("Af gefnu tilefni fékk fékk daninn vilja sýnum " +... "framgengt í auknu mæli.") +>>> for tok in g: +... print("{0:10} {1}".format(tok.txt or "", tok.error_description)) +``` + +Output: + +``` +Að Orðasambandið 'Af gefnu tilefni' var leiðrétt í 'að gefnu tilefni' +gefnu +tilefni +fékk Endurtekið orð ('fékk') var fellt burt +Daninn Orð á að byrja á hástaf: 'daninn' +vilja Orðasambandið 'vilja sýnum framgengt' var leiðrétt í 'vilja sínum framgengt' +sínum +framgengt +í Orðasambandið 'í auknu mæli' var leiðrétt í 'í auknum mæli' +auknum +mæli +. 
+``` + +To perform full spelling and grammar analysis of a sentence from Python code: + +```python +from reynir_correct import check_single +sent = check_single("Páli, vini mínum, langaði að horfa á sjónnvarpið.") +for annotation in sent.annotations: + print("{0}".format(annotation)) +``` + +Output: + +``` +000-004: P_WRONG_CASE_þgf_þf Á líklega að vera 'Pál, vin minn' / [Pál , vin minn] +009-009: S004 Orðið 'sjónnvarpið' var leiðrétt í 'sjónvarpið' +``` + +```python +sent.tidy_text +``` + +Output: + +``` +'Páli, vini mínum, langaði að horfa á sjónvarpið.' +``` + +The `annotation.start` and `annotation.end` properties +(here `start` is 0 and `end` is 4) contain the 0-based indices of the first +and last tokens to which the annotation applies. +The `annotation.start_char` and `annotation.end_char` properties +contain the indices of the first and last character to which the +annotation applies, within the original input string. + +`P_WRONG_CASE_þgf_þf` and `S004` are error codes. + +For more detailed, low-level control, the `check_errors()` function +supports options and can produce various types of output: + +```python +from reynir_correct import check_errors +x = "Páli, vini mínum, langaði að horfa á sjónnvarpið." +options = { "input": x, "annotations": True, "format": "text" } +s = check_errors(**options) +for i in s.split("\n"): + print(i) +``` + +Output: + +``` +Pál, vin minn, langaði að horfa á sjónvarpið. +000-004: P_WRONG_CASE_þgf_þf Á líklega að vera 'Pál, vin minn' | 'Páli, vini mínum,' -> 'Pál, vin minn' | None +009-009: S004 Orðið 'sjónnvarpið' var leiðrétt í 'sjónvarpið' | 'sjónnvarpið' -> 'sjónvarpið' | None +``` + +The following options can be specified: + +| Option | Description | Default value | +|---|---|---| +| `input` | Defines the input. Can be a string or an iterable of strings, such as a file object. | `sys.stdin` | +| `all_errors` (alias `grammar`) | Defines the level of correction. If False, only token-level annotation is carried out. 
If True, sentence-level annotation is carried out. | `True` | +| `annotate_unparsed_sentences` | If True, sentences that cannot be parsed are annotated in their entirety as errors. | `True` | +| `generate_suggestion_list` | If True, annotations can in certain cases contain a list of possible corrections, for the user to pick from. | `False` | +| `suppress_suggestions` | If True, more far-fetched automatically suggested corrections are suppressed. | `False` | +| `ignore_wordlist` | The value is a set of strings to whitelist. Each string is a word that should not be marked as an error or corrected. The comparison is case-sensitive. | `set()` | +| `one_sent` | The input contains a single sentence only. Sentence splitting should not be attempted. | `False` | +| `ignore_rules` | A set of error codes that should be ignored in the annotation process. | `set()` | +| `tov_config` | Path to an additional configuration file that may be provided for correcting custom tone-of-voice issues. | `False` | + +An overview of error codes is available [here](https://github.com/mideind/GreynirCorrect/blob/master/doc/errorcodes.rst). + +## Prerequisites + +GreynirCorrect runs on CPython 3.9 or newer, and on PyPy 3.9 or newer. It has +been tested on Linux, macOS and Windows. The +[PyPI package](https://pypi.org/project/reynir-correct/) +includes binary wheels for common environments, but if the setup on your OS +requires compilation from sources, you may need + +```bash +$ sudo apt-get install python3-dev +``` + +...or something to that effect to enable this. + +## Installation + +To install this package (assuming you have Python >= 3.9 with `pip` installed): + +```bash +$ pip install reynir-correct +``` + +If you want to be able to edit the source, do the following +(assuming you have `git` installed): + +```bash +$ git clone https://github.com/mideind/GreynirCorrect +$ cd GreynirCorrect +$ # [ Activate your virtualenv here if you have one ] +$ pip install -e . 
+``` + +The package source code is now in `GreynirCorrect/src/reynir_correct`. + +## The command line tool + +After installation, the corrector can be invoked directly from the command line: + +```bash +$ correct input.txt output.txt +``` + +...or: + +```bash +$ echo "Þinngið samþikkti tilöguna" | correct +Þingið samþykkti tillöguna +``` + +Input and output files are encoded in UTF-8. If the files are not +given explicitly, `stdin` and `stdout` are used for input and output, +respectively. + +Empty lines in the input are treated as sentence boundaries. + +By default, the output consists of one sentence per line, where each +line ends with a single newline character (ASCII LF, `chr(10)`, `"\n"`). +Within each line, tokens are separated by spaces. + +The following (mutually exclusive) options can be specified +on the command line: + +| Option | Description | +|---|---| +| `--csv` | Output token objects in CSV format, one per line. Sentences are separated by lines containing `0,"",""` | +| `--json` | Output token objects in JSON format, one per line.| +| `--normalize` | Normalize punctuation, causing e.g. quotes to be output in Icelandic form and hyphens to be regularized. | +| `--grammar` | Output whole-sentence annotations, including corrections and suggestions for spelling and grammar. Each sentence in the input is output as a text line containing a JSON object, terminated by a newline. | + +The CSV and JSON formats of token objects are identical to those documented +for the [Tokenizer package](https://github.com/mideind/Tokenizer). + +The JSON format of whole-sentence annotations is identical to the one documented for +the [Yfirlestur.is HTTPS REST API](https://github.com/mideind/Yfirlestur#https-api). + +Type `correct -h` to get a short help message. 
+ +### Command Line Examples + +```bash +$ echo "Atvinuleysi jógst um 3%" | correct +Atvinnuleysi jókst um 3% +``` + +```bash +$ echo "Barnið vil grænann lit" | correct --csv +6,"Barnið","" +6,"vil","" +6,"grænan","" +6,"lit","" +0,"","" +``` + +Note how *vil* is not corrected, as it is a valid and common word, and +the `correct` command does not perform grammar checking by default. + +```bash +$ echo "Pakkin er fyrir hestin" | correct --json +{"k":"BEGIN SENT"} +{"k":"WORD","t":"Pakkinn"} +{"k":"WORD","t":"er"} +{"k":"WORD","t":"fyrir"} +{"k":"WORD","t":"hestinn"} +{"k":"END SENT"} +``` + +To perform whole-sentence grammar checking and annotation as well as spell checking, +use the `--grammar` option: + +```bash +$ echo "Ég kláraði verkefnið þrátt fyrir að ég var þreittur." | correct --grammar +{ + "original":"Ég kláraði verkefnið þrátt fyrir að ég var þreittur.", + "corrected":"Ég kláraði verkefnið þrátt fyrir að ég var þreyttur.", + "tokens":[ + {"k":6,"x":"Ég","o":"Ég"}, + {"k":6,"x":"kláraði","o":" kláraði"}, + {"k":6,"x":"verkefnið","o":" verkefnið"}, + {"k":6,"x":"þrátt fyrir","o":" þrátt fyrir"}, + {"k":6,"x":"að","o":" að"}, + {"k":6,"x":"ég","o":" ég"}, + {"k":6,"x":"var","o":" var"}, + {"k":6,"x":"þreyttur","o":" þreittur"}, + {"k":1,"x":".","o":"."} + ], + "annotations":[ + { + "start":6, + "end":6, + "start_char":35, + "end_char":37, + "code":"P_MOOD_ACK", + "text":"Hér er réttara að nota viðtengingarhátt\n sagnarinnar 'vera', þ.e. 
'væri'.", + "detail":"Í viðurkenningarsetningum á borð við 'Z'\n í dæminu 'X gerði Y þrátt fyrir að Z' á sögnin að vera + í viðtengingarhætti fremur en framsöguhætti.", + "suggest":"væri" + }, + { + "start":7, + "end":7, + "start_char":38, + "end_char":41, + "code":"S004", + "text":"Orðið 'þreittur' var leiðrétt í 'þreyttur'", + "detail":"", + "suggest":"þreyttur" + } + ] +} +``` + +The output has been formatted for legibility - each input sentence is actually +represented by a JSON object in a single line of text, terminated by newline. + +Note that the `corrected` field only includes token-level spelling correction +(in this case *þreittur* `->` *þreyttur*), but no grammar corrections. +The grammar corrections are found in the `annotations` list. +To apply corrections and suggestions from the annotations, +replace source text or tokens (as identified by the `start` and `end`, +or `start_char` and `end_char` properties) with the `suggest` field, if present. + +## Tests + +To run the built-in tests, install [pytest](https://docs.pytest.org/en/latest/), +`cd` to your `GreynirCorrect` subdirectory (and optionally activate your +virtualenv), then run: + +```bash +$ python -m pytest +``` + +## Acknowledgements + +Parts of this software are developed under the auspices of the +Icelandic Government's 5-year Language Technology Programme for Icelandic, +which is managed by Almannarómur and described +[here](https://www.stjornarradid.is/lisalib/getfile.aspx?itemid=56f6368e-54f0-11e7-941a-005056bc530c) +(English version [here](https://clarin.is/media/uploads/mlt-en.pdf)). + +## Copyright and License + +[![Miðeind ehf.](https://github.com/mideind/GreynirPackage/raw/master/doc/_static/MideindLogoVert100.png?raw=true)](https://mideind.is) + +**Copyright © 2018-2025 Miðeind ehf.** + +GreynirCorrect's original author is *Vilhjálmur Þorsteinsson*. 
+ +This software is licensed under the *MIT License*: + + *Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the Software, + and to permit persons to whom the Software is furnished to do so, + subject to the following conditions:* + + *The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software.* + + *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.* + +---- + +GreynirCorrect indirectly embeds the [Database of Icelandic Morphology](https://bin.arnastofnun.is) +([Beygingarlýsing íslensks nútímamáls](https://bin.arnastofnun.is)) +along with directly using +[Ritmyndir](https://bin.arnastofnun.is/DMII/LTdata/comp-format/nonstand-form/), +a collection of non-standard word forms. +Miðeind does not claim any endorsement by the BÍN authors or copyright holders. + +The BÍN source data are publicly available under the +[CC BY-SA 4.0 license](https://creativecommons.org/licenses/by-sa/4.0/), as further +detailed [here in English](https://bin.arnastofnun.is/DMII/LTdata/conditions/) +and [here in Icelandic](https://bin.arnastofnun.is/gogn/mimisbrunnur/). + +In accordance with the BÍN license terms, credit is hereby given as follows: + +*Beygingarlýsing íslensks nútímamáls. 
Stofnun Árna Magnússonar í íslenskum fræðum.* +*Höfundur og ritstjóri Kristín Bjarnadóttir.* + diff --git a/README.rst b/README.rst deleted file mode 100644 index fcf4f14..0000000 --- a/README.rst +++ /dev/null @@ -1,442 +0,0 @@ - -.. image:: https://img.shields.io/badge/License-MIT-yellow.svg - :target: https://opensource.org/licenses/MIT -.. image:: https://img.shields.io/badge/python-3.9-blue.svg - :target: https://www.python.org/downloads/release/python-390/ -.. image:: https://img.shields.io/pypi/v/reynir-correct - :target: https://pypi.org/project/reynir-correct/ -.. image:: https://shields.io/github/v/release/mideind/GreynirCorrect?display_name=tag - :target: https://github.com/mideind/GreynirCorrect/releases -.. image:: https://github.com/mideind/GreynirCorrect/actions/workflows/python-package.yml/badge.svg - :target: https://github.com/mideind/GreynirCorrect/actions?query=workflow%3A%22Python+package%22 - -============================================================== -GreynirCorrect: Spelling and grammar correction for Icelandic -============================================================== - -******** -Overview -******** - -**GreynirCorrect** is a Python 3 (>=3.9) package and command line tool for -**checking and correcting spelling and grammar** in Icelandic text. - -GreynirCorrect relies on the `Greynir `__ package, -by the same authors, to tokenize and parse text. - -GreynirCorrect is documented in detail `here `__. - -The software has three main modes of operation, described below. - -As a fourth alternative, you can call the JSON REST API -of `Yfirlestur.is `__ -to apply the GreynirCorrect spelling and grammar engine to your text, -as `documented here `__. - -Token-level correction ----------------------- - -GreynirCorrect can tokenize text and return an automatically corrected token stream. -This catches token-level errors, such as spelling errors and erroneous -phrases, but not grammatical errors. Token-level correction is relatively fast. 
- -Full grammar analysis ---------------------- - -GreynirCorrect can analyze text grammatically by attempting to parse -it, after token-level correction. The parsing is done according to Greynir's -context-free grammar for Icelandic, augmented with additional production -rules for common grammatical errors. The analysis returns a set of annotations -(errors and suggestions) that apply to spans (consecutive tokens) within -sentences in the resulting token list. Full grammar analysis is considerably -slower than token-level correction. - -Command-line tool ------------------ - -GreynirCorrect can be invoked as a command-line tool -to perform token-level correction and, optionally, grammar analysis. -The command is ``correct infile.txt outfile.txt``. -The command-line tool is further documented below. - -******** -Examples -******** - -To perform token-level correction from Python code: - -.. code-block:: python - - >>> from reynir_correct import tokenize - >>> g = tokenize("Af gefnu tilefni fékk fékk daninn vilja sýnum " - >>> "framgengt í auknu mæli.") - >>> for tok in g: - >>> print("{0:10} {1}".format(tok.txt or "", tok.error_description)) - -Output:: - - Að Orðasambandið 'Af gefnu tilefni' var leiðrétt í 'að gefnu tilefni' - gefnu - tilefni - fékk Endurtekið orð ('fékk') var fellt burt - Daninn Orð á að byrja á hástaf: 'daninn' - vilja Orðasambandið 'vilja sýnum framgengt' var leiðrétt í 'vilja sínum framgengt' - sínum - framgengt - í Orðasambandið 'í auknu mæli' var leiðrétt í 'í auknum mæli' - auknum - mæli - . - -To perform full spelling and grammar analysis of a sentence from Python code: - -.. 
code-block:: python - - from reynir_correct import check_single - sent = check_single("Páli, vini mínum, langaði að horfa á sjónnvarpið.") - for annotation in sent.annotations: - print("{0}".format(annotation)) - -Output:: - - 000-004: P_WRONG_CASE_þgf_þf Á líklega að vera 'Pál, vin minn' / [Pál , vin minn] - 009-009: S004 Orðið 'sjónnvarpið' var leiðrétt í 'sjónvarpið' - -.. code-block:: python - - sent.tidy_text - -Output:: - - 'Páli, vini mínum, langaði að horfa á sjónvarpið.' - -The ``annotation.start`` and ``annotation.end`` properties -(here ``start`` is 0 and ``end`` is 4) contain the 0-based indices of the first -and last tokens to which the annotation applies. -The ``annotation.start_char`` and ``annotation.end_char`` properties -contain the indices of the first and last character to which the -annotation applies, within the original input string. - -``P_WRONG_CASE_þgf_þf`` and ``S004`` are error codes. - -For more detailed, low-level control, the ``check_errors()`` function -supports options and can produce various types of output: - -.. code-block:: python - - from reynir_correct import check_errors - x = "Páli, vini mínum, langaði að horfa á sjónnvarpið." - options = { "input": x, "annotations": True, "format": "text" } - s = check_errors(**options) - for i in s.split("\n"): - print(i) - -Output:: - - Pál, vin minn, langaði að horfa á sjónvarpið. - 000-004: P_WRONG_CASE_þgf_þf Á líklega að vera 'Pál, vin minn' | 'Páli, vini mínum,' -> 'Pál, vin minn' | None - 009-009: S004 Orðið 'sjónnvarpið' var leiðrétt í 'sjónvarpið' | 'sjónnvarpið' -> 'sjónvarpið' | None - - -The following options can be specified: - -+-----------------------------------+--------------------------------------------------+-----------------+ -| | Option | Description | Default value | -+-----------------------------------+--------------------------------------------------+-----------------+ -| | ``input`` | Defines the input. 
Can be a string or an | ``sys.stdin`` | -| | iterable of strings, such as a file object. | | -+-----------------------------------+--------------------------------------------------+-----------------+ -| | ``all_errors`` | Defines the level of correction. | ``True`` | -| | (alias ``grammar``) | If False, only token-level annotation is | | -| | carried out. If True, sentence-level | | -| | annotation is carried out. | | -+-----------------------------------+--------------------------------------------------+-----------------+ -| | ``annotate_unparsed_sentences`` | If True, sentences that cannot be parsed | ``True`` | -| | are annotated in their entirety as errors. | | -+-----------------------------------+--------------------------------------------------+-----------------+ -| | ``generate_suggestion_list`` | If True, annotations can in certain | ``False`` | -| | cases contain a list of possible corrections, | | -| | for the user to pick from. | | -+-----------------------------------+--------------------------------------------------+-----------------+ -| | ``suppress_suggestions`` | If True, more farfetched automatically | ``False`` | -| | suggested corrections are suppressed. | | -+-----------------------------------+--------------------------------------------------+-----------------+ -| | ``ignore_wordlist`` | The value is a set of strings to whitelist. | ``set()`` | -| | Each string is a word that should not be | | -| | marked as an error or corrected. The comparison | | -| | is case-sensitive. | | -+-----------------------------------+--------------------------------------------------+-----------------+ -| | ``one_sent`` | The input contains a single sentence only. | ``False`` | -| | Sentence splitting should not be attempted. | | -+-----------------------------------+--------------------------------------------------+-----------------+ -| | ``ignore_rules`` | A set of error codes that should be ignored | ``set()`` | -| | in the annotation process. 
| | -+-----------------------------------+--------------------------------------------------+-----------------+ -| | ``tov_config`` | Path to an additional configuration file that | ``False`` | -| | may be provided for correcting custom | | -| | tone-of-voice issues. | | -+-----------------------------------+--------------------------------------------------+-----------------+ - -An overview of error codes is available `here `__. - -************* -Prerequisites -************* - -GreynirCorrect runs on CPython 3.9 or newer, and on PyPy 3.9 or newer. It has -been tested on Linux, macOS and Windows. The -`PyPi package `_ -includes binary wheels for common environments, but if the setup on your OS -requires compilation from sources, you may need - -.. code-block:: bash - - $ sudo apt-get install python3-dev - -...or something to similar effect to enable this. - -************ -Installation -************ - -To install this package (assuming you have Python >= 3.9 with ``pip`` installed): - -.. code-block:: bash - - $ pip install reynir-correct - -If you want to be able to edit the source, do like so -(assuming you have ``git`` installed): - -.. code-block:: bash - - $ git clone https://github.com/mideind/GreynirCorrect - $ cd GreynirCorrect - $ # [ Activate your virtualenv here if you have one ] - $ pip install -e . - -The package source code is now in ``GreynirCorrect/src/reynir_correct``. - -********************* -The command line tool -********************* - -After installation, the corrector can be invoked directly from the command line: - -.. code-block:: bash - - $ correct input.txt output.txt - -...or: - -.. code-block:: bash - - $ echo "Þinngið samþikkti tilöguna" | correct - Þingið samþykkti tillöguna - -Input and output files are encoded in UTF-8. If the files are not -given explicitly, ``stdin`` and ``stdout`` are used for input and output, -respectively. - -Empty lines in the input are treated as sentence boundaries. 
- -By default, the output consists of one sentence per line, where each -line ends with a single newline character (ASCII LF, ``chr(10)``, ``"\n"``). -Within each line, tokens are separated by spaces. - -The following (mutually exclusive) options can be specified -on the command line: - -+-------------------+---------------------------------------------------+ -| | ``--csv`` | Output token objects in CSV | -| | format, one per line. Sentences are separated by | -| | lines containing ``0,"",""`` | -+-------------------+---------------------------------------------------+ -| | ``--json`` | Output token objects in JSON format, one per line.| -+-------------------+---------------------------------------------------+ -| | ``--normalize`` | Normalize punctuation, causing e.g. quotes to be | -| | output in Icelandic form and hyphens to be | -| | regularized. | -+-------------------+---------------------------------------------------+ -| | ``--grammar`` | Output whole-sentence annotations, including | -| | corrections and suggestions for spelling and | -| | grammar. Each sentence in the input is output as | -| | a text line containing a JSON object, terminated | -| | by a newline. | -+-------------------+---------------------------------------------------+ - -The CSV and JSON formats of token objects are identical to those documented -for the `Tokenizer package `__. - -The JSON format of whole-sentence annotations is identical to the one documented for -the `Yfirlestur.is HTTPS REST API `__. - -Type ``correct -h`` to get a short help message. - - -Command Line Examples ---------------------- - -.. code-block:: bash - - $ echo "Atvinuleysi jógst um 3%" | correct - Atvinnuleysi jókst um 3% - - -.. 
code-block:: bash - - $ echo "Barnið vil grænann lit" | correct --csv - 6,"Barnið","" - 6,"vil","" - 6,"grænan","" - 6,"lit","" - 0,"","" - - -Note how *vil* is not corrected, as it is a valid and common word, and -the ``correct`` command does not perform grammar checking by default. - - -.. code-block:: bash - - $ echo "Pakkin er fyrir hestin" | correct --json - {"k":"BEGIN SENT"} - {"k":"WORD","t":"Pakkinn"} - {"k":"WORD","t":"er"} - {"k":"WORD","t":"fyrir"} - {"k":"WORD","t":"hestinn"} - {"k":"END SENT"} - -To perform whole-sentence grammar checking and annotation as well as spell checking, -use the ``--grammar`` option: - - -.. code-block:: bash - - $ echo "Ég kláraði verkefnið þrátt fyrir að ég var þreittur." | correct --grammar - { - "original":"Ég kláraði verkefnið þrátt fyrir að ég var þreittur.", - "corrected":"Ég kláraði verkefnið þrátt fyrir að ég var þreyttur.", - "tokens":[ - {"k":6,"x":"Ég","o":"Ég"}, - {"k":6,"x":"kláraði","o":" kláraði"}, - {"k":6,"x":"verkefnið","o":" verkefnið"}, - {"k":6,"x":"þrátt fyrir","o":" þrátt fyrir"}, - {"k":6,"x":"að","o":" að"}, - {"k":6,"x":"ég","o":" ég"}, - {"k":6,"x":"var","o":" var"}, - {"k":6,"x":"þreyttur","o":" þreittur"}, - {"k":1,"x":".","o":"."} - ], - "annotations":[ - { - "start":6, - "end":6, - "start_char":35, - "end_char":37, - "code":"P_MOOD_ACK", - "text":"Hér er réttara að nota viðtengingarhátt - sagnarinnar 'vera', þ.e. 'væri'.", - "detail":"Í viðurkenningarsetningum á borð við 'Z' - í dæminu 'X gerði Y þrátt fyrir að Z' á sögnin að vera - í viðtengingarhætti fremur en framsöguhætti.", - "suggest":"væri" - }, - { - "start":7, - "end":7, - "start_char":38, - "end_char":41, - "code":"S004", - "text":"Orðið 'þreittur' var leiðrétt í 'þreyttur'", - "detail":"", - "suggest":"þreyttur" - } - ] - } - - -The output has been formatted for legibility - each input sentence is actually -represented by a JSON object in a single line of text, terminated by newline. 
- -Note that the ``corrected`` field only includes token-level spelling correction -(in this case *þreittur* ``->`` *þreyttur*), but no grammar corrections. -The grammar corrections are found in the ``annotations`` list. -To apply corrections and suggestions from the annotations, -replace source text or tokens (as identified by the ``start`` and ``end``, -or ``start_char`` and ``end_char`` properties) with the ``suggest`` field, if present. - -***** -Tests -***** - -To run the built-in tests, install `pytest `_, -``cd`` to your ``GreynirCorrect`` subdirectory (and optionally activate your -virtualenv), then run: - -.. code-block:: bash - - $ python -m pytest - -**************** -Acknowledgements -**************** - -Parts of this software are developed under the auspices of the -Icelandic Government's 5-year Language Technology Programme for Icelandic, -which is managed by Almannarómur and described -`here `__ -(English version `here `__). - -********************* -Copyright and License -********************* - -.. image:: https://github.com/mideind/GreynirPackage/raw/master/doc/_static/MideindLogoVert100.png?raw=true - :target: https://mideind.is - :align: right - :alt: Miðeind ehf. - -**Copyright © 2018-2025 Miðeind ehf.** - -GreynirCorrect's original author is *Vilhjálmur Þorsteinsson*. 
- -This software is licensed under the *MIT License*: - - *Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation - files (the "Software"), to deal in the Software without restriction, - including without limitation the rights to use, copy, modify, merge, - publish, distribute, sublicense, and/or sell copies of the Software, - and to permit persons to whom the Software is furnished to do so, - subject to the following conditions:* - - *The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software.* - - *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.* - ----- - -GreynirCorrect indirectly embeds the `Database of Icelandic Morphology `_ -(`Beygingarlýsing íslensks nútímamáls `_), abbreviated BÍN, -along with directly using -`Ritmyndir `_, -a collection of non-standard word forms. -Miðeind does not claim any endorsement by the BÍN authors or copyright holders. - -The BÍN source data are publicly available under the -`CC BY-SA 4.0 license `_, as further -detailed `here in English `_ -and `here in Icelandic `_. - -In accordance with the BÍN license terms, credit is hereby given as follows: - -*Beygingarlýsing íslensks nútímamáls. 
Stofnun Árna Magnússonar í íslenskum fræðum.* -*Höfundur og ritstjóri Kristín Bjarnadóttir.* diff --git a/pyproject.toml b/pyproject.toml index dcbea62..730738c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,6 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", - "Programming Language :: Python :: 3.14", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "Topic :: Software Development :: Libraries :: Python Modules",