diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d3716aa6..3710b631 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,5 +1,13 @@ name: main -on: [push] +on: + push: + branches: + - master + pull_request: + types: + - opened + - reopened + - synchronize jobs: build-and-test: strategy: @@ -18,9 +26,8 @@ jobs: - name: conda env run: | - wget -O Mambaforge.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh" - curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh" - bash Mambaforge.sh -b -p "${HOME}/conda" + wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" + bash Miniforge3.sh -b -p "${HOME}/conda" source "${HOME}/conda/etc/profile.d/conda.sh" source "${HOME}/conda/etc/profile.d/mamba.sh" which conda @@ -102,7 +109,7 @@ jobs: - name: push artifact if: ${{ (matrix.python-version == 3.9) }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: doc path: /tmp/docs diff --git a/.gitignore b/.gitignore index 29a95722..335fa300 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +env/ *.swo *gfffeature.so *.swp diff --git a/doc/source/api.rst b/doc/source/api.rst index 2f9adefe..309a689a 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -117,10 +117,10 @@ Integration with other tools :toctree: autodocs :nosignatures: - gffutils.biopython_integration.to_seqfeature - gffutils.biopython_integration.from_seqfeature - gffutils.pybedtools_integration.tsses - gffutils.pybedtools_integration.to_bedtool + biopython_integration.to_seqfeature + biopython_integration.from_seqfeature + pybedtools_integration.tsses + pybedtools_integration.to_bedtool @@ -131,10 +131,10 @@ Utilities :toctree: autodocs :nosignatures: - gffutils.helpers.asinterval - gffutils.helpers.merge_attributes - 
gffutils.helpers.sanitize_gff_db - gffutils.helpers.annotate_gff_db - gffutils.helpers.infer_dialect - gffutils.helpers.example_filename - gffutils.inspect.inspect + helpers.asinterval + helpers.merge_attributes + helpers.sanitize_gff_db + helpers.annotate_gff_db + helpers.infer_dialect + helpers.example_filename + inspect.inspect diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst index f1f7545b..8ec34b22 100644 --- a/doc/source/changelog.rst +++ b/doc/source/changelog.rst @@ -3,6 +3,35 @@ Change log ========== + +v0.14 +----- + +- If a value contained a semicolon there would be unexpected behavior (reported + in `#212 `__). This is solved + by adding a new entry to the dialect, ``semicolon in quotes``, and running + the necessary regular expression only when needed -- thanks to @DevangThakkar for the + fix. +- Refactored the attributes parsing to make it clearer to follow along, and + added more tests. The refactoring fixed some subtle bugs on corner cases: + - Previously, for features with repeated keys, the ``order`` key of dialects + would list the repeated keys each time which could result in undetermined + behavior. The ``order`` key is now unique and only the first occurrence of + a repeated key will be added to the order. + - Previously, the ``ensembl_gtf.txt`` example file had a leading *space* in + front of the attributes. This looks to be an error in the creation of the + example file in the first place, but had previously parsed fine. Now the + parser (correctly) does not parse it. Since I'm unaware of any cases in the + wild that have a leading space, I actually consider the new parsing to be + more correct. + - Added tests to directly inspect the inferred dialects for the test cases. 
+- CI, testing, and docs infrastructure updates (miniforge instead of + mambaforge; GitHub Action version bumps; skip biopython test if it's not + installed; reduce build errors for docs) +- Fix `#224 `__, which was caused + by changes to the ``argh`` package used for the command-line tool. + + v0.13 ----- diff --git a/doc/source/conf.py b/doc/source/conf.py index c65c4a28..2b85647c 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -53,5 +53,3 @@ templates_path = ['_templates'] exclude_patterns = [] html_theme = 'sphinx_rtd_theme' -html_static_path = ['_static'] -html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] diff --git a/doc/source/dialect.rst b/doc/source/dialect.rst index b02d7c6a..aed018b9 100644 --- a/doc/source/dialect.rst +++ b/doc/source/dialect.rst @@ -38,7 +38,8 @@ A GTF dialect might look like this:: 'multival separator': ',', 'quoted GFF2 values': True, 'repeated keys': False, - 'trailing semicolon': True} + 'trailing semicolon': True, + 'semicolon in quotes': False} In contrast, a GFF dialect might look like this:: @@ -49,7 +50,9 @@ In contrast, a GFF dialect might look like this:: 'multival separator': ',', 'quoted GFF2 values': False, 'repeated keys': False, - 'trailing semicolon': False} + 'trailing semicolon': False, + 'semicolon in quotes': False} + As other real-world files are brought to the attention of the developers, it's likely that more entries will be added to the dialect. diff --git a/doc/source/examples.rst b/doc/source/examples.rst index 6b631236..54d8e45a 100644 --- a/doc/source/examples.rst +++ b/doc/source/examples.rst @@ -235,7 +235,7 @@ data upon import into the database: ... return x -Now we can supply this tranform function to :func:`create_db`: +Now we can supply this transform function to :func:`create_db`: >>> fn = gffutils.example_filename('ensembl_gtf.txt') >>> db = gffutils.create_db(fn, ":memory:", @@ -643,8 +643,8 @@ attributes to have the same format. 
To help with this, we can use the >>> dialect = helpers.infer_dialect( ... 'Transcript "B0019.1" ; WormPep "WP:CE40797" ; Note "amx-2" ; Prediction_status "Partially_confirmed" ; Gene "WBGene00000138" ; CDS "B0019.1" ; WormPep "WP:CE40797" ; Note "amx-2" ; Prediction_status "Partially_confirmed" ; Gene "WBGene00000138"', ... ) ->>> print(dialect) -{'leading semicolon': False, 'trailing semicolon': False, 'quoted GFF2 values': True, 'field separator': ' ; ', 'keyval separator': ' ', 'multival separator': ',', 'fmt': 'gtf', 'repeated keys': True, 'order': ['Transcript', 'WormPep', 'Note', 'Prediction_status', 'Gene', 'CDS', 'WormPep', 'Note', 'Prediction_status', 'Gene']} +>>> print({k: v for k, v in sorted(dialect.items())}) +{'field separator': ' ; ', 'fmt': 'gtf', 'keyval separator': ' ', 'leading semicolon': False, 'multival separator': ',', 'order': ['Transcript', 'WormPep', 'Note', 'Prediction_status', 'Gene', 'CDS'], 'quoted GFF2 values': True, 'repeated keys': True, 'semicolon in quotes': False, 'trailing semicolon': False} >>> db.dialect = dialect diff --git a/gffutils/constants.py b/gffutils/constants.py index 901e7146..2543e64d 100644 --- a/gffutils/constants.py +++ b/gffutils/constants.py @@ -127,6 +127,12 @@ # vs # ID=001; Name=gene1 "field separator": ";", + # Sometimes there are semicolons inside quotes that break things, e.g., + # + # note "Evidence 1a: Function1, Function2" + # vs + # note "Evidence 1a: Function; PubMedId: 123, 456" + "semicolon in quotes": False, # Usually "=" for GFF3; " " for GTF, e.g., # # gene_id "GENE1" diff --git a/gffutils/interface.py b/gffutils/interface.py index 9216cd39..ee9ec743 100644 --- a/gffutils/interface.py +++ b/gffutils/interface.py @@ -1285,7 +1285,7 @@ def create_introns( with open('tmp.gtf', 'w') as fout: for intron in db.create_introns(**intron_kwargs): - fout.write(str(intron) + "\n") + fout.write(str(intron) + "\\n") db.update(gffutils.DataIterator('tmp.gtf'), **create_kwargs) """ diff --git 
a/gffutils/parser.py b/gffutils/parser.py index 058423ad..f18a269a 100644 --- a/gffutils/parser.py +++ b/gffutils/parser.py @@ -1,9 +1,8 @@ # Portions copied over from BCBio.GFF.GFFParser import re -import copy import collections -import urllib +from urllib import parse from gffutils import constants from gffutils.exceptions import AttributeStringError @@ -16,8 +15,31 @@ ch.setFormatter(formatter) logger.addHandler(ch) +# Compile regexs up front gff3_kw_pat = re.compile(r"\w+=") +# Regex for each separator that will be tested +quoted_semicolon_patterns = dict() + +for sep in (" ; ", "; ", ";"): + quoted_semicolon_patterns[sep] = re.compile( + rf""" + {re.escape(sep)} # The separator we're considering (escaped for VERBOSE mode) + (?= # Positive lookahead: does remaining content match? + (?: # Start non-capturing group + [^"] # Either: match any character that is NOT a quote + | # OR + "[^"]*" # Match a complete quoted string, specifically: + # - opening quote ", followed by + # - zero or more non-quote characters [^"]* + # - followed by closing quote " + )* # Repeat the above pattern zero or more times + $ # Until we reach the end of the string + ) # End of lookahead + """, + re.VERBOSE, + ) + # Encoding/decoding notes # ----------------------- # From @@ -50,9 +72,9 @@ # # See also issue #98. # -# Note that spaces are NOT encoded. Some GFF files have spaces encoded; in -# these cases round-trip invariance will not hold since the %20 will be decoded -# but not re-encoded. +# Note that spaces are NOT supposed to be encoded. Yet some GFF files have +# spaces encoded anyway; in these cases round-trip invariance will not hold +# since the %20 will be decoded but not re-encoded. _to_quote = "\n\t\r%;=&," _to_quote += "".join([chr(i) for i in range(32)]) _to_quote += chr(127) @@ -74,6 +96,208 @@ def __missing__(self, b): quoter = Quoter() +def _split_keyvals(keyval_str, dialect=None): + """ + Dialect detection requires partially parsing the attributes. 
+ """ + from gffutils import feature + + quals = feature.dict_class() + + if not keyval_str: + return quals, dialect + + infer_dialect = False + if dialect is None: + infer_dialect = True + dialect = {} + + # No known cases yet of different multival separator + dialect["multival separator"] = "," + + # Detection for these dialect fields can work on the full attribute + # string. Other detection needs to wait until we've further parsed the + # attributes. + if infer_dialect: + dialect["trailing semicolon"] = keyval_str[-1] == ";" + dialect["leading semicolon"] = keyval_str[0] == ";" + semicolon_in_quotes = False + sep = None + for sep in (" ; ", "; ", ";"): + parts = keyval_str.split(sep) + if len(parts) > 1: + # If naive split differs from more expensive regex, we infer there was + # a semicolon within quoted value and we'll have to use the expensive + # method later + parts_regex = re.split(quoted_semicolon_patterns[sep], keyval_str) + if parts != parts_regex: + semicolon_in_quotes = True + break + dialect["semicolon in quotes"] = semicolon_in_quotes + dialect["field separator"] = sep + + if dialect["trailing semicolon"]: + keyval_str = keyval_str.rstrip(";") + + if dialect["leading semicolon"]: + keyval_str = keyval_str.lstrip(";") + + if dialect["semicolon in quotes"]: + parts = re.split( + quoted_semicolon_patterns[dialect["field separator"]], keyval_str + ) + else: + parts = keyval_str.split(dialect["field separator"]) + + # The next stage of dialect inference works on the 'parts' like: + # + # parts = ["ID=001", "Name=gene1"] + # + if infer_dialect: + dialect["fmt"] = "gff3" + if gff3_kw_pat.match(parts[0]): + dialect["fmt"] = "gff3" + dialect["keyval separator"] = "=" + else: + dialect["fmt"] = "gtf" + dialect["keyval separator"] = " " + + # Now we split + # + # parts = ["ID=001", "Name=gene1"] + # + # into + # + # key_val_tuples = [("ID", "001"), ("Name", "gene1")] + # + # in a dialect-dependent manner. 
+ kvsep = dialect["keyval separator"] + key_val_tuples = [p.split(kvsep) for p in parts] + + # With the split keys we can detect whether any are repeated + if infer_dialect: + keys = [i[0] for i in key_val_tuples] + dialect["repeated keys"] = len(keys) != len(set(keys)) + + # For dialect detection, this will help figure out if there is inconsistent + # quoting across values. + quoted_values = [] + + for i in key_val_tuples: + + if len(i) == 2: + # Easy, on-spec case + key, val = i + + elif len(i) == 1: + # By convention, no value becomes an empty string, e.g.: + # + # attributes = "ID=001;is_gene;" + key = i[0] + val = "" + + else: + # Multiple spaces within quoted values are joined back together without + # requiring a regex (like we need when there's *field* separator like + # a semicolon in the values) + # + # That is: + # + # attributes = 'gene_description "an important gene"; gene_id "g001"' + # + # becomes + # + # key_val_tuples = [("gene_description", "an", "important", "gene"), ("gene_id", "g001")] + # + # so here that first key/val pair will become: + # + # key = "gene_description" + # val = "an important gene" + # + # Another pathological case, this time for GFF3: + # + # Alias=SGN-M1347;ID=T0028;Note=marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126 + # ^ ^ + key = i[0] + val = kvsep.join(i[1:]) + + # By convention all values are lists, even if there's only one value + # (or even no values) + if key not in quals: + quals[key] = [] + + # This will run on every value. + if infer_dialect: + quoted = len(val) > 0 and val[0] == '"' and val[-1] == '"' + quoted_values.append(quoted) + dialect["quoted GFF2 values"] = quoted + + if dialect["quoted GFF2 values"] and val: + val = val.strip('"') + + if val: + # For repeated keys dialect, don't split on an internal comma. 
That is, + # + # attributes = 'db_xref="g01,g02"; db_xref="XYZ"' + # + # becomes: + # + # { + # "db_xref": ["g01,g02", "XYZ"] + # } + # + if dialect.get("repeated keys"): + quals[key].append(val) + + # Otherwise, split but only if it's a comma without a space. So: + # + # attributes = 'db_xref="g01,g02"' + # + # becomes + # { + # "db_xref": ["g01", "g02"] + # } + # but + # + # attributes = 'description="kinase, subunit 1"' + # ^ note the space here + # becomes + # { + # "description": ["kinase, subunit 1"] + # } + # + else: + # E.g. the "kinase, subunit 1" example above + if ", " in val: + quals[key].append(val) + else: + quals[key].extend(val.split(",")) + + # If there was inconsistent quoting, we fall back to "not quoted" so + # as to avoid incorrectly stripping off first and last quotes. + if infer_dialect and len(set(quoted_values)) > 1: + # Prior behavior was to use whatever the first value used + dialect["quoted GFF2 values"] = quoted_values[0] + + # Though there could be an argument for considering quotes in mixed + # cases to be part of the string, though technically they should be + # %-encoded if so. + # dialect["quoted GFF2 values"] = False + + # Handle unquoting of %-encoded values + if not constants.ignore_url_escape_characters and dialect["fmt"] == "gff3": + for key, vals in quals.items(): + unquoted = [parse.unquote(v) for v in vals] + quals[key] = unquoted + + # Now that we're not supporting old Python versions we can rely on dict + # insertion order + if infer_dialect: + dialect["order"] = list(quals.keys()) + + return quals, dialect + + def _reconstruct(keyvals, dialect, keep_order=False, sort_attribute_values=False): """ Reconstructs the original attributes string according to the dialect. @@ -156,6 +380,20 @@ def sort_key(x): part = key else: if dialect["fmt"] == "gtf": + # By convention, GTF attributes with no value are reconstructed + # with an empty string. 
E.g.: + # 'gene_id "gene1"; is_gene;' + # + # becomes + # + # { + # "gene_id": "gene1", + # "is_gene": "" + # } + # + # and is printed as: + # + # 'gene_id "gene1"; is_gene "";' part = dialect["keyval separator"].join([key, '""']) else: part = key @@ -169,207 +407,3 @@ def sort_key(x): parts_str += ";" return parts_str - - -# TODO: -# Cythonize -- profiling shows that the bulk of the time is spent on this -# function... -def _split_keyvals(keyval_str, dialect=None): - """ - Given the string attributes field of a GFF-like line, split it into an - attributes dictionary and a "dialect" dictionary which contains information - needed to reconstruct the original string. - - Lots of logic here to handle all the corner cases. - - If `dialect` is None, then do all the logic to infer a dialect from this - attribute string. - - Otherwise, use the provided dialect (and return it at the end). - """ - - def _unquote_quals(quals, dialect): - """ - Handles the unquoting (decoding) of percent-encoded characters. - - See notes on encoding/decoding above. - """ - if not constants.ignore_url_escape_characters and dialect["fmt"] == "gff3": - for key, vals in quals.items(): - unquoted = [urllib.parse.unquote(v) for v in vals] - quals[key] = unquoted - return quals - - infer_dialect = False - if dialect is None: - # Make a copy of default dialect so it can be modified as needed - dialect = copy.copy(constants.dialect) - infer_dialect = True - from gffutils import feature - - quals = feature.dict_class() - if not keyval_str: - return quals, dialect - - # If a dialect was provided, then use that directly. 
- if not infer_dialect: - if dialect["trailing semicolon"]: - keyval_str = keyval_str.rstrip(";") - - parts = keyval_str.split(dialect["field separator"]) - - kvsep = dialect["keyval separator"] - if dialect["leading semicolon"]: - pieces = [] - for p in parts: - if p and p[0] == ";": - p = p[1:] - pieces.append(p.strip().split(kvsep)) - key_vals = [(p[0], " ".join(p[1:])) for p in pieces] - - if dialect["fmt"] == "gff3": - key_vals = [p.split(kvsep) for p in parts] - else: - leadingsemicolon = dialect["leading semicolon"] - pieces = [] - for i, p in enumerate(parts): - if i == 0 and leadingsemicolon: - p = p[1:] - pieces.append(p.strip().split(kvsep)) - key_vals = [(p[0], " ".join(p[1:])) for p in pieces] - - quoted = dialect["quoted GFF2 values"] - for item in key_vals: - # Easy if it follows spec - if len(item) == 2: - key, val = item - - # Only key provided? - elif len(item) == 1: - key = item[0] - val = "" - - else: - key = item[0] - val = dialect["keyval separator"].join(item[1:]) - - try: - quals[key] - except KeyError: - quals[key] = [] - - if quoted: - if len(val) > 0 and val[0] == '"' and val[-1] == '"': - val = val[1:-1] - - if val: - # TODO: if there are extra commas for a value, just use empty - # strings - # quals[key].extend([v for v in val.split(',') if v]) - vals = val.split(",") - quals[key].extend(vals) - - quals = _unquote_quals(quals, dialect) - return quals, dialect - - # If we got here, then we need to infer the dialect.... - # - # Reset the order to an empty list so that it will only be populated with - # keys that are found in the file. - dialect["order"] = [] - - # ensembl GTF has trailing semicolon - if keyval_str[-1] == ";": - keyval_str = keyval_str[:-1] - dialect["trailing semicolon"] = True - - # GFF2/GTF has a semicolon with at least one space after it. - # Spaces can be on both sides (e.g. wormbase) - # GFF3 works with no spaces. - # So split on the first one we can recognize... 
- for sep in (" ; ", "; ", ";"): - parts = keyval_str.split(sep) - if len(parts) > 1: - dialect["field separator"] = sep - break - - # Is it GFF3? They have key-vals separated by "=" - if gff3_kw_pat.match(parts[0]): - key_vals = [p.split("=") for p in parts] - dialect["fmt"] = "gff3" - dialect["keyval separator"] = "=" - - # Otherwise, key-vals separated by space. Key is first item. - else: - dialect["keyval separator"] = " " - pieces = [] - for p in parts: - # Fix misplaced semicolons in keys in some GFF2 files - if p and p[0] == ";": - p = p[1:] - dialect["leading semicolon"] = True - pieces.append(p.strip().split(" ")) - key_vals = [(p[0], " ".join(p[1:])) for p in pieces] - - for item in key_vals: - - # Easy if it follows spec - if len(item) == 2: - key, val = item - - # Only key provided? - elif len(item) == 1: - key = item[0] - val = "" - - # Pathological cases where values of a key have within them the key-val - # separator, e.g., - # Alias=SGN-M1347;ID=T0028;Note=marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126 - # ^ ^ - else: - key = item[0] - val = dialect["keyval separator"].join(item[1:]) - - # Is the key already in there? - if key in quals: - dialect["repeated keys"] = True - else: - quals[key] = [] - - # Remove quotes in GFF2 - if len(val) > 0 and val[0] == '"' and val[-1] == '"': - val = val[1:-1] - dialect["quoted GFF2 values"] = True - if val: - - # TODO: if there are extra commas for a value, just use empty - # strings - # quals[key].extend([v for v in val.split(',') if v]) - - # See issue #198, where commas within a description can incorrectly - # cause the dialect inference to conclude that there are not - # repeated keys. - # - # More description in PR #208. 
- if dialect["repeated keys"]: - quals[key].append(val) - else: - vals = val.split(",") - - # If anything starts with a leading space, then we infer that - # it was part of a description or some other typographical - # interpretation, not a character to split multiple vals on -- - # and append the original val rather than the split vals. - if any([i[0] == " " for i in vals if i]): - quals[key].append(val) - else: - quals[key].extend(vals) - - # keep track of the order of keys - dialect["order"].append(key) - - if (dialect["keyval separator"] == " ") and (dialect["quoted GFF2 values"]): - dialect["fmt"] = "gtf" - - quals = _unquote_quals(quals, dialect) - return quals, dialect diff --git a/gffutils/pybedtools_integration.py b/gffutils/pybedtools_integration.py index 5c5c2b90..e01e4911 100644 --- a/gffutils/pybedtools_integration.py +++ b/gffutils/pybedtools_integration.py @@ -113,7 +113,7 @@ def tsses( if they overlap (as in the first two): - >>> print(tsses(db)) # doctest: +NORMALIZE_WHITESPACE + >>> print(gffutils.pybedtools_integration.tsses(db)) # doctest: +NORMALIZE_WHITESPACE chr2L gffutils_derived transcript_TSS 7529 7529 . + . gene_id "FBgn0031208"; transcript_id "FBtr0300689"; chr2L gffutils_derived transcript_TSS 7529 7529 . + . gene_id "FBgn0031208"; transcript_id "FBtr0300690"; chr2L gffutils_derived transcript_TSS 11000 11000 . - . gene_id "Fk_gene_1"; transcript_id "transcript_Fk_gene_1"; @@ -124,7 +124,7 @@ def tsses( Default merging, showing the first two TSSes merged and reported as a single unique TSS for the gene. Note the conversion to BED: - >>> x = tsses(db, merge_overlapping=True) + >>> x = gffutils.pybedtools_integration.tsses(db, merge_overlapping=True) >>> print(x) # doctest: +NORMALIZE_WHITESPACE chr2L 7528 7529 FBgn0031208 . + chr2L 10999 11000 Fk_gene_1 . - @@ -135,7 +135,7 @@ def tsses( be easier to parse than the original GTF or GFF file. With no merging specified, we must add `as_bed6=True` to see the names in BED format. 
- >>> x = tsses(db, attrs=['gene_id', 'transcript_id'], as_bed6=True) + >>> x = gffutils.pybedtools_integration.tsses(db, attrs=['gene_id', 'transcript_id'], as_bed6=True) >>> print(x) # doctest: +NORMALIZE_WHITESPACE chr2L 7528 7529 FBgn0031208:FBtr0300689 . + chr2L 7528 7529 FBgn0031208:FBtr0300690 . + @@ -145,7 +145,7 @@ def tsses( Use a 3kb merge distance so the last 2 features are merged together: - >>> x = tsses(db, merge_overlapping=True, merge_kwargs=dict(d=3000)) + >>> x = gffutils.pybedtools_integration.tsses(db, merge_overlapping=True, merge_kwargs=dict(d=3000)) >>> print(x) # doctest: +NORMALIZE_WHITESPACE chr2L 7528 7529 FBgn0031208 . + chr2L 10999 12500 Fk_gene_1,Fk_gene_2 . - @@ -154,7 +154,7 @@ def tsses( The set of unique TSSes for each gene, +1kb upstream and 500bp downstream: - >>> x = tsses(db, merge_overlapping=True) + >>> x = gffutils.pybedtools_integration.tsses(db, merge_overlapping=True) >>> x = x.slop(l=1000, r=500, s=True, genome='dm3') >>> print(x) # doctest: +NORMALIZE_WHITESPACE chr2L 6528 8029 FBgn0031208 . + diff --git a/gffutils/scripts/gffutils-cli b/gffutils/scripts/gffutils-cli index 051b76d5..70a882b5 100755 --- a/gffutils/scripts/gffutils-cli +++ b/gffutils/scripts/gffutils-cli @@ -76,7 +76,7 @@ def fetch(db, ids): (like grep -v)''') @arg('--exclude-self', help='''Use this to suppress reporting the IDs you've provided.''') -def children(db, ids, limit=None, exclude=None, exclude_self=False): +def children(db, ids, *, limit=None, exclude=None, exclude_self=False): """ Fetch children from the database according to ID. """ @@ -110,7 +110,7 @@ def children(db, ids, limit=None, exclude=None, exclude_self=False): (like grep -v)''') @arg('--exclude-self', help='''Use this to suppress reporting the IDs you've provided.''') -def parents(db, ids, limit=None, exclude=None, exclude_self=False): +def parents(db, ids, *, limit=None, exclude=None, exclude_self=False): """ Fetch parents from the database according to ID. 
""" @@ -167,7 +167,7 @@ def common(db): @arg('--disable-infer-transcripts', help='''Disable inferring of transcript extents for GTF files. Use this if your GTF file already has "transcript" featuretypes''') -def create(filename, output=None, force=False, quiet=False, merge="merge", +def create(filename, *, output=None, force=False, quiet=False, merge="merge", disable_infer_genes=False, disable_infer_transcripts=False): """ Create a database. @@ -198,7 +198,7 @@ def clean(filename): @arg('--in-place', help='''Sanitize file in-place: overwrites current file with sanitized version.''') -def sanitize(filename, +def sanitize(filename, *, in_memory=True, in_place=False): """ @@ -225,7 +225,7 @@ def sanitize(filename, @arg('filename', help='''GFF or GTF file to use.''') @arg('--in-place', help='''Remove duplicates in place (overwrite current file.)''') -def rmdups(filename, in_place=False): +def rmdups(filename, *, in_place=False): """ Remove duplicates from a GFF file. """ @@ -278,7 +278,7 @@ def convert(filename): @arg('--featuretype', help='''Restrict to a particular featuretype. This can be faster than doing a grep on the output, since it restricts the search space in the database''') -def search(db, text, featuretype=None): +def search(db, text, *, featuretype=None): """ Search the attributes. 
""" diff --git a/gffutils/test/attr_test_cases.py b/gffutils/test/attr_test_cases.py index b08afbe3..9ee1de96 100644 --- a/gffutils/test/attr_test_cases.py +++ b/gffutils/test/attr_test_cases.py @@ -15,36 +15,130 @@ """ + attrs = [ + dict( + str="ID=001;Name=gene1", + attrs={ + "ID": ["001"], + "Name": ["gene1"], + }, + ok=None, + dialect_mods={"order": ["ID", "Name"]}, + ), + dict( + str="ID=001;Name=gene1;", + attrs={ + "ID": ["001"], + "Name": ["gene1"], + }, + ok=None, + dialect_mods={"trailing semicolon": True, "order": ["ID", "Name"]}, + ), + dict( + str="ID=001; Name=gene1;", + attrs={ + "ID": ["001"], + "Name": ["gene1"], + }, + ok=None, + dialect_mods={ + "trailing semicolon": True, + "field separator": "; ", + "order": ["ID", "Name"], + }, + ), + dict( + str='ID="001"', + attrs={"ID": ["001"]}, + ok=None, + dialect_mods={ + "quoted GFF2 values": True, + "order": ["ID"], + }, + ), + dict( + str='ID="001"; Name="gene1"; types="a,b,c"', + attrs={"ID": ["001"], "Name": ["gene1"], "types": ["a", "b", "c"]}, + ok=None, + dialect_mods={ + "quoted GFF2 values": True, + "field separator": "; ", + "order": ["ID", "Name", "types"], + }, + ), + dict( + str='ID="001"; Name="gene1"; types="a"; types="b"; types="c"', + attrs={"ID": ["001"], "Name": ["gene1"], "types": ["a", "b", "c"]}, + ok=None, + dialect_mods={ + "quoted GFF2 values": True, + "field separator": "; ", + "repeated keys": True, + "order": ["ID", "Name", "types"], + }, + ), + dict( + str="Name=gene1;ID=001", + attrs={"Name": ["gene1"], "ID": ["001"]}, + ok=None, + dialect_mods={"order": ["Name", "ID"]}, + ), + dict( + str='gene_id "001";gene_name "gene1"', + attrs={"gene_id": ["001"], "gene_name": ["gene1"]}, + ok=None, + dialect_mods={ + "fmt": "gtf", + "quoted GFF2 values": True, + "keyval separator": " ", + "order": ["gene_id", "gene_name"], + }, + ), # c_elegans_WS199_shortened_gff.txt - ( - "count=1;gene=amx-2;sequence=SAGE:ggcagagtcttttggca;" "transcript=B0019.1", - { + dict( + 
str="count=1;gene=amx-2;sequence=SAGE:ggcagagtcttttggca;transcript=B0019.1", + attrs={ "count": ["1"], "gene": ["amx-2"], "sequence": ["SAGE:ggcagagtcttttggca"], "transcript": ["B0019.1"], }, - None, + ok=None, + dialect_mods={"order": ["count", "gene", "sequence", "transcript"]}, ), # ensembl_gtf.txt - ( - 'gene_id "Y74C9A.6"; transcript_id "Y74C9A.6"; exon_number "1"; ' - 'gene_name "Y74C9A.6"; transcript_name "NR_001477.2";', - { + dict( + str=( + 'gene_id "Y74C9A.6"; transcript_id "Y74C9A.6"; exon_number "1"; gene_name "Y74C9A.6"; transcript_name "NR_001477.2";' + ), + attrs={ "gene_id": ["Y74C9A.6"], "transcript_id": ["Y74C9A.6"], "exon_number": ["1"], "gene_name": ["Y74C9A.6"], "transcript_name": ["NR_001477.2"], }, - None, + ok=None, + dialect_mods={ + "trailing semicolon": True, + "fmt": "gtf", + "keyval separator": " ", + "field separator": "; ", + "quoted GFF2 values": True, + "order": [ + "gene_id", + "transcript_id", + "exon_number", + "gene_name", + "transcript_name", + ], + }, ), # F3-unique-3.v2.gff - ( - "g=A3233312322232122211;i=1;p=1.000;q=23,12,18,17,10,24,19,14,27,9,23" - ",9,16,20,11,7,8,4,4,14;u=0,0,0,1", - { + dict( + str="g=A3233312322232122211;i=1;p=1.000;q=23,12,18,17,10,24,19,14,27,9,23,9,16,20,11,7,8,4,4,14;u=0,0,0,1", + attrs={ "g": ["A3233312322232122211"], "i": ["1"], "p": ["1.000"], @@ -72,20 +166,27 @@ ], "u": ["0", "0", "0", "1"], }, - None, + ok=None, + dialect_mods={"order": ["g", "i", "p", "q", "u"]}, ), # glimmer_nokeyval.gff3 - ( - "ID=GL0000006;Name=GL0000006;Lack 3'-end;", - {"ID": ["GL0000006"], "Name": ["GL0000006"], "Lack 3'-end": []}, - None, + dict( + str="ID=GL0000006;Name=GL0000006;Lack 3'-end;", + attrs={"ID": ["GL0000006"], "Name": ["GL0000006"], "Lack 3'-end": []}, + ok=None, + dialect_mods={ + "order": ["ID", "Name", "Lack 3'-end"], + "trailing semicolon": True, + }, ), # hybrid1.gff3 - ( - "ID=A00469;Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:" - 
"A00469,Swissprot:P01241,PFAM:PF00103,AFFX-U95:1332_f_at,Swissprot:" - "SOMA_HUMAN;Note=growth%20hormone%201;Alias=GH1", - { + dict( + str=( + "ID=A00469;Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:" + "A00469,Swissprot:P01241,PFAM:PF00103,AFFX-U95:1332_f_at,Swissprot:" + "SOMA_HUMAN;Note=growth%20hormone%201;Alias=GH1" + ), + attrs={ "ID": ["A00469"], "Dbxref": [ "AFFX-U133:205840_x_at", @@ -99,9 +200,10 @@ "Note": ["growth hormone 1"], "Alias": ["GH1"], }, - "ID=A00469;Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:" + ok="ID=A00469;Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:" "A00469,Swissprot:P01241,PFAM:PF00103,AFFX-U95:1332_f_at,Swissprot:" "SOMA_HUMAN;Note=growth hormone 1;Alias=GH1", + dialect_mods={"order": ["ID", "Dbxref", "Note", "Alias"]}, ), # jgi_gff2.txt # @@ -109,19 +211,30 @@ # quoted but string values are. Only way to make this be invariant is to # keep track of the "flavor" of each attribute; not sure it's worth the # effort / processing time. - ( - 'name "fgenesh1_pg.C_chr_1000007"; transcriptId 873', - {"name": ["fgenesh1_pg.C_chr_1000007"], "transcriptId": ["873"]}, - 'name "fgenesh1_pg.C_chr_1000007"; transcriptId "873"', + dict( + str='name "fgenesh1_pg.C_chr_1000007"; transcriptId 873', + attrs={"name": ["fgenesh1_pg.C_chr_1000007"], "transcriptId": ["873"]}, + ok='name "fgenesh1_pg.C_chr_1000007"; transcriptId "873"', + dialect_mods={ + "order": ["name", "transcriptId"], + "quoted GFF2 values": True, + "keyval separator": " ", + "fmt": "gtf", + "field separator": "; ", + }, ), # mouse_extra_comma.gff3: extra comma line # # Note extra empty string in the dictionary's "Parent" field. 
# - ( - "Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1,", - {"Name": ["CDS:NC_000083.5:LOC100040603"], "Parent": ["XM_001475631.1", ""]}, - None, + dict( + str="Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1,", + attrs={ + "Name": ["CDS:NC_000083.5:LOC100040603"], + "Parent": ["XM_001475631.1", ""], + }, + ok=None, + dialect_mods={"order": ["Name", "Parent"]}, ), # mouse_extra_comma.gff3 # @@ -135,20 +248,23 @@ # # In both cases, the dictionary entry is simply an empty list; it's just in # the reconstruction where things get tricky. - ( - "ID=;Parent=XM_001475631.1", - {"ID": [], "Parent": ["XM_001475631.1"]}, - "ID;Parent=XM_001475631.1", + dict( + str="ID=;Parent=XM_001475631.1", + attrs={"ID": [], "Parent": ["XM_001475631.1"]}, + ok="ID;Parent=XM_001475631.1", + dialect_mods={"order": ["ID", "Parent"]}, ), # ncbi_gff3.txt - ( - "ID=NC_008596.1:speB:unknown_transcript_1;Parent=NC_008596.1:speB;" - "locus_tag=MSMEG_1072;EC_number=3.5.3.11;note=identified%20by%20mat" - "ch%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20prote" - "in%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;p" - "rotein_id=YP_885468.1;db_xref=GI:118469242;db_xref=GeneID:4535378;" - "exon_number=1", - { + dict( + str=( + "ID=NC_008596.1:speB:unknown_transcript_1;Parent=NC_008596.1:speB;" + "locus_tag=MSMEG_1072;EC_number=3.5.3.11;note=identified%20by%20mat" + "ch%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20prote" + "in%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;p" + "rotein_id=YP_885468.1;db_xref=GI:118469242;db_xref=GeneID:4535378;" + "exon_number=1" + ), + attrs={ "ID": ["NC_008596.1:speB:unknown_transcript_1"], "Parent": ["NC_008596.1:speB"], "locus_tag": ["MSMEG_1072"], @@ -164,18 +280,39 @@ "db_xref": ["GI:118469242", "GeneID:4535378"], "exon_number": ["1"], }, - "ID=NC_008596.1:speB:unknown_transcript_1;Parent=NC_008596.1:speB;" + ok="ID=NC_008596.1:speB:unknown_transcript_1;Parent=NC_008596.1:speB;" 
"locus_tag=MSMEG_1072;EC_number=3.5.3.11;note=identified by mat" "ch to protein family HMM PF00491%3B match to prote" "in family HMM TIGR01230;transl_table=11;product=agmatinase;p" "rotein_id=YP_885468.1;db_xref=GI:118469242;db_xref=GeneID:4535378;" "exon_number=1", + dialect_mods={ + "order": [ + "ID", + "Parent", + "locus_tag", + "EC_number", + "note", + "transl_table", + "product", + "protein_id", + "db_xref", + "exon_number", + ], + "repeated keys": True, + }, ), # wormbase_gff2_alt.txt # - ( - 'CDS "cr01.sctg102.wum.2.1"', - {"CDS": ["cr01.sctg102.wum.2.1"]}, - None, + dict( + str='CDS "cr01.sctg102.wum.2.1"', + attrs={"CDS": ["cr01.sctg102.wum.2.1"]}, + ok=None, + dialect_mods={ + "order": ["CDS"], + "quoted GFF2 values": True, + "keyval separator": " ", + "fmt": "gtf", + }, ), ] diff --git a/gffutils/test/data/ensembl_gtf.txt b/gffutils/test/data/ensembl_gtf.txt index f54f8fdd..88de6d51 100644 --- a/gffutils/test/data/ensembl_gtf.txt +++ b/gffutils/test/data/ensembl_gtf.txt @@ -1,33 +1,33 @@ -I snoRNA exon 3747 3909 . - . gene_id "Y74C9A.6"; transcript_id "Y74C9A.6"; exon_number "1"; gene_name "Y74C9A.6"; transcript_name "NR_001477.2"; -I protein_coding exon 12764812 12764949 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12764812 12764937 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding start_codon 12764935 12764937 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding exon 12764291 12764471 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "2"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12764291 12764471 . 
- 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "2"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12763979 12764102 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "3"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12763979 12764102 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "3"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12763729 12763882 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "4"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12763729 12763882 . - 1 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "4"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12763448 12763655 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "5"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12763448 12763655 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "5"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12763112 12763249 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "6"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12763112 12763249 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "6"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12762648 12762806 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "7"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12762648 12762806 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "7"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12762127 12762268 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "8"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12762127 12762268 . 
- 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "8"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12761799 12761953 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "9"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12761799 12761953 . - 1 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "9"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12761172 12761516 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "10"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12761172 12761516 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "10"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12760834 12760904 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "11"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12760834 12760904 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "11"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12760365 12760494 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "12"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12760365 12760494 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "12"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12760227 12760319 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "13"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12760227 12760319 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "13"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12759949 12760013 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "14"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12759949 12760013 . 
- 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "14"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding exon 12759579 12759828 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; -I protein_coding CDS 12759748 12759828 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; -I protein_coding stop_codon 12759745 12759747 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; +I snoRNA exon 3747 3909 . - . gene_id "Y74C9A.6"; transcript_id "Y74C9A.6"; exon_number "1"; gene_name "Y74C9A.6"; transcript_name "NR_001477.2"; +I protein_coding exon 12764812 12764949 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12764812 12764937 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding start_codon 12764935 12764937 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding exon 12764291 12764471 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "2"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12764291 12764471 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "2"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12763979 12764102 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "3"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12763979 12764102 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "3"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12763729 12763882 . - . 
gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "4"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12763729 12763882 . - 1 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "4"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12763448 12763655 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "5"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12763448 12763655 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "5"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12763112 12763249 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "6"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12763112 12763249 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "6"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12762648 12762806 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "7"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12762648 12762806 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "7"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12762127 12762268 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "8"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12762127 12762268 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "8"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12761799 12761953 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "9"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12761799 12761953 . - 1 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "9"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12761172 12761516 . - . 
gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "10"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12761172 12761516 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "10"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12760834 12760904 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "11"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12760834 12760904 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "11"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12760365 12760494 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "12"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12760365 12760494 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "12"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12760227 12760319 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "13"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12760227 12760319 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "13"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12759949 12760013 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "14"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12759949 12760013 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "14"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding exon 12759579 12759828 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; +I protein_coding CDS 12759748 12759828 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1"; +I protein_coding stop_codon 12759745 12759747 . 
- 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; diff --git a/gffutils/test/parser_test.py b/gffutils/test/parser_test.py index ed578a7e..4f510287 100644 --- a/gffutils/test/parser_test.py +++ b/gffutils/test/parser_test.py @@ -53,17 +53,32 @@ def test_attrs_OK(item): (see attr_test_cases.py for details); `acceptable_reconstruction` handles those. """ - attr_str, attr_dict, acceptable_reconstruction = item - result, dialect = parser._split_keyvals(attr_str) + attr_str = item["str"] + attr_dict = item["attrs"] + acceptable_reconstruction = item["ok"] + dialect_mods = item["dialect_mods"] + + result, inferred_dialect = parser._split_keyvals(attr_str) result = dict(result) assert result == attr_dict, result - reconstructed = parser._reconstruct(result, dialect, keep_order=True) + reconstructed = parser._reconstruct(result, inferred_dialect, keep_order=True) if acceptable_reconstruction: assert reconstructed == acceptable_reconstruction, reconstructed else: assert reconstructed == attr_str, reconstructed + # Get the default dialect for comparison, and update it with any diffs + # indicated by the test case + default_dialect = constants.dialect.copy() + default_dialect.update(dialect_mods) + + print(inferred_dialect) + print(dialect_mods) + print(attr_str) + assert default_dialect == inferred_dialect + + def parser_smoke_test(): """ @@ -75,7 +90,7 @@ def parser_smoke_test(): parser.logger.setLevel(logging.CRITICAL) for filename in TEST_FILENAMES: p = iterators._FileIterator(filename) - for i in p: + for _ in p: continue @@ -93,7 +108,8 @@ def test_empty_recontruct(): def test_empty_split_keyvals(): attrs, dialect = parser._split_keyvals(keyval_str=None) assert attrs == feature.dict_class() - assert dialect == constants.dialect + # assert dialect == constants.dialect + assert dialect is None def test_repeated_keys_conflict(): diff --git a/gffutils/test/test_1.py b/gffutils/test/test_1.py index 
2b88cc04..1780d996 100644 --- a/gffutils/test/test_1.py +++ b/gffutils/test/test_1.py @@ -14,10 +14,7 @@ import tempfile import http.server as SimpleHTTPServer -if sys.version_info.major == 3: - import socketserver as SocketServer -else: - import SocketServer +import socketserver as SocketServer import multiprocessing import json @@ -482,58 +479,51 @@ def test_sanitize_gff(): print("Sanitized GFF successfully.") -def test_region(): - +@pytest.mark.parametrize("kwargs,expected", [ + # previously failed, see issue #45 + (dict(seqid="chr2L", start=1, end=2e9, completely_within=True), 27), + (dict(region="chr2L", start=0), ValueError), + (dict(region="chr2L", end=0), ValueError), + (dict(region="chr2L", seqid=0), ValueError), + # these coords should catch everything + (dict(region="chr2L:7529-12500"), 27), + # stranded versions: + (dict(region="chr2L:7529-12500", strand="."), 0), + (dict(region="chr2L:7529-12500", strand="+"), 21), + (dict(region="chr2L:7529-12500", strand="-"), 6), + # different ways of selecting only that last exon in the last gene: + (dict(seqid="chr2L", start=11500, featuretype="exon"), 1), + (dict(seqid="chr2L", start=9500, featuretype="exon", strand="+"), 1), + # alternative method + (dict(seqid="chr2L", start=7529, end=12500), 27), + # since default completely_within=False, this catches anything that + # falls after 7680. So it only excludes the 5'UTR, which ends at 7679. 
+ (dict(seqid="chr2L", start=7680), 26), + # but completely_within=True will exclude the gene and mRNAs, first + # exon and the 5'UTR + (dict(seqid="chr2L", start=7680, completely_within=True), 22), + # similarly, this will *exclude* anything before 7680 + (dict(seqid="chr2L", end=7680), 5), + # and also similarly, this will only get us the 5'UTR which is the only + # feature falling completely before 7680 + (dict(seqid="chr2L", end=7680, completely_within=True), 1), + # and there's only features from chr2L in this file, so this catches + # everything too + (dict(region="chr2L"), 27), + # using seqid should work similarly to `region` with only chromosome + (dict(seqid="chr2L"), 27), + # nonexistent + (dict(region="nowhere"), 0), +]) +def test_region(kwargs, expected): db_fname = gffutils.example_filename("FBgn0031208.gff") db = gffutils.create_db(db_fname, ":memory:", keep_order=True) - def _check(item): - kwargs, expected = item - try: - obs = list(db.region(**kwargs)) - assert len(obs) == expected, "expected %s got %s" % (expected, len(obs)) - except expected: - pass - - regions = [ - # previously failed, see issue #45 - (dict(seqid="chr2L", start=1, end=2e9, completely_within=True), 27), - (dict(region="chr2L", start=0), ValueError), - (dict(region="chr2L", end=0), ValueError), - (dict(region="chr2L", seqid=0), ValueError), - # these coords should catch everything - (dict(region="chr2L:7529-12500"), 27), - # stranded versions: - (dict(region="chr2L:7529-12500", strand="."), 0), - (dict(region="chr2L:7529-12500", strand="+"), 21), - (dict(region="chr2L:7529-12500", strand="-"), 6), - # different ways of selecting only that last exon in the last gene: - (dict(seqid="chr2L", start=11500, featuretype="exon"), 1), - (dict(seqid="chr2L", start=9500, featuretype="exon", strand="+"), 1), - # alternative method - (dict(seqid="chr2L", start=7529, end=12500), 27), - # since default completely_within=False, this catches anything that - # falls after 7680. 
So it only excludes the 5'UTR, which ends at 7679. - (dict(seqid="chr2L", start=7680), 26), - # but completely_within=True will exclude the gene and mRNAs, first - # exon and the 5'UTR - (dict(seqid="chr2L", start=7680, completely_within=True), 22), - # similarly, this will *exclude* anything before 7680 - (dict(seqid="chr2L", end=7680), 5), - # and also similarly, this will only get us the 5'UTR which is the only - # feature falling completely before 7680 - (dict(seqid="chr2L", end=7680, completely_within=True), 1), - # and there's only features from chr2L in this file, so this catches - # everything too - (dict(region="chr2L"), 27), - # using seqid should work similarly to `region` with only chromosome - (dict(seqid="chr2L"), 27), - # nonexistent - (dict(region="nowhere"), 0), - ] - - for item in regions: - yield _check, item + try: + obs = list(db.region(**kwargs)) + assert len(obs) == expected, "expected %s got %s" % (expected, len(obs)) + except expected: + pass def test_nonascii(): diff --git a/gffutils/test/test_biopython_integration.py b/gffutils/test/test_biopython_integration.py index 58c5866a..e9f8e81d 100644 --- a/gffutils/test/test_biopython_integration.py +++ b/gffutils/test/test_biopython_integration.py @@ -1,6 +1,10 @@ from gffutils import example_filename import gffutils import gffutils.biopython_integration as bp +import pytest + +# Skip tests entirely if BioPython not available +pytest.importorskip('Bio') def test_roundtrip(): diff --git a/gffutils/test/test_issues.py b/gffutils/test/test_issues.py index 79996ba5..64deda16 100644 --- a/gffutils/test/test_issues.py +++ b/gffutils/test/test_issues.py @@ -200,7 +200,10 @@ def test_pr_144(): assert f.attributes["a"] == [""] assert str(f) == ". . . . . . . . 
a" g = gffutils.feature.feature_from_line(str(f)) - assert g == f + g.dialect["fmt"] = "gff3" + print(g.attributes) + print(g.dialect) + assert str(g) == str(f) def test_pr_172(): @@ -452,21 +455,43 @@ def test_issue_198(): assert f.attributes["description"] == ["WASP family homolog 7, pseudogene"] - # If we remove one of the db_xref keys, then the parser sees the comma and - # figures it's a multivalue key. + # If we remove one of the db_xref keys, then previously the parser saw the + # comma and figured it was a multivalue key, and split it. Now, it's + # correctly identified as a single-value key. + # + # Note that we still have gene_synonym as a repeated key. line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7, pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";' f = feature.feature_from_line(line) + assert f.dialect["repeated keys"] # Previous result, note leading space --------------------------->| | # assert f.attributes['description'] == ['WASP family homolog 7', ' pseudogene'] + + # Current result: not split. assert f.attributes["description"] == ["WASP family homolog 7, pseudogene"] - # But removing that space before "pseudogene" means it's interpreted as - # a multivalue attribute + # Removing that space before "pseudogene" might mean it's a multivalue, but + # we decide on the convention that if keys are repeated at all, that wins. + # So we still don't split line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . 
gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7,pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";' f = feature.feature_from_line(line) + assert f.dialect["repeated keys"] + assert f.attributes["description"] == ["WASP family homolog 7,pseudogene"] + + # But if we get rid of all repeated keys, it's interpreted as multiple values + line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7,pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; pseudo "true";' + f = feature.feature_from_line(line) + assert not f.dialect["repeated keys"] assert f.attributes["description"] == ["WASP family homolog 7", "pseudogene"] + # ....but if there's a ", " (comma followed by space) instead of just + # comma, then it's not split. + line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7, pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; pseudo "true";' + f = feature.feature_from_line(line) + assert not f.dialect["repeated keys"] + assert f.attributes["description"] == ["WASP family homolog 7, pseudogene"] + + # Confirm behavior of corner cases like a trailing comma line = "chr17 RefSeq CDS 6806527 6806553 . + 0 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1," f = feature.feature_from_line(line) @@ -612,3 +637,18 @@ def test_issue_213(): db = gffutils.create_db(tmp, dbfn="issue_213.db", force=True) assert db.directives == ["gff-version 3"], db.directives assert len(db.directives) == 1 + +def test_issue_212(): + + + data = dedent( + """ + NC_000964.3 RefSeq CDS 410 1747 . 
+ 0 gene_id "BSU_00010"; transcript_id "unassigned_transcript_1"; db_xref "EnsemblGenomes-Gn:BSU00010"; db_xref "EnsemblGenomes-Tr:CAB11777"; db_xref "GOA:P05648"; db_xref "InterPro:IPR001957"; db_xref "InterPro:IPR003593"; db_xref "InterPro:IPR010921"; db_xref "InterPro:IPR013159"; db_xref "InterPro:IPR013317"; db_xref "InterPro:IPR018312"; db_xref "InterPro:IPR020591"; db_xref "InterPro:IPR024633"; db_xref "InterPro:IPR027417"; db_xref "PDB:4TPS"; db_xref "SubtiList:BG10065"; db_xref "UniProtKB/Swiss-Prot:P05648"; db_xref "GenBank:NP_387882.1"; db_xref "GeneID:939978"; experiment "publication(s) with functional evidences, PMID:2167836, 2846289, 12682299, 16120674, 1779750, 28166228"; gbkey "CDS"; gene "dnaA"; locus_tag "BSU_00010"; note "Evidence 1a: Function from experimental evidences in the studied strain; PubMedId: 2167836, 2846289, 12682299, 16120674, 1779750, 28166228; Product type f : factor"; product "chromosomal replication initiator informational ATPase"; protein_id "NP_387882.1"; transl_table "11"; exon_number "1"; + """ + ) + inferred_dialect = gffutils.helpers.infer_dialect(data.split('\t')[-1]) + assert inferred_dialect["semicolon in quotes"] + + f = next(iter(gffutils.DataIterator(data, from_string=True, dialect=inferred_dialect))) + assert f.dialect["semicolon in quotes"] + assert f.attributes["note"] == ["Evidence 1a: Function from experimental evidences in the studied strain; PubMedId: 2167836, 2846289, 12682299, 16120674, 1779750, 28166228; Product type f : factor"]