Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,26 @@ Below is a summary of the main input parameters.
--seed SEED Seed used for subsampling.
--output OUTPUT Output directory.
--overwrite Overwrite output directory.
--append Append results into pre-existing directories (if running >1)
--getorganelle Format seed and gene database for get organelle.
--email EMAIL Email for Entrez.
--api API API key for NCBI. Optional.
```

### How it works

The user provides a taxonomy as a taxonomic name (e.g. "Arabidopsis thaliana") or NCBI taxonomy ID (e.g. "3702"). However, **it is reccomended to use NCBI taxonomy IDs**, as some unrelated groups share names, for example the name "Drosophila" is used for both a genus of fruit flies and basidiomycete fungi.
The user provides a taxonomy as a taxonomic name (e.g. "Arabidopsis thaliana") or NCBI taxonomy ID (e.g. "3702"). However, **it is recommended to use NCBI taxonomy IDs**, as some unrelated groups share names, for example the name "Drosophila" is used for both a genus of fruit flies and basidiomycete fungi.

The script will count the number or target sequences available for the user given taxonomy. If there are not enough sequence available based on the `--min` parameter, the script will count the number of sequences available for the parent rank, until the minimum threshold is reached.
The script will count the number or target sequences available for the user given taxonomy. If there are not enough sequence available based on the `--min` parameter, the script will count the number of sequences available for the parent rank, until the minimum threshold is reached.

If the maximum number of sequences is exceeded, based on the `--max` parameter, the script will subsample the children lineages, aiming for an even sampling across children.
If the maximum number of sequences is exceeded, based on the `--max` parameter, the script will subsample the children lineages, aiming for an even sampling across children.

Note that the `--target` option `ribosomal` will search for sequences with any of the ribosomal annotations (28S/25S, 18S, 5.8S), whereas `ribosomal_complete` will try to find sequences with all annotations.
Note that the `--target` option `ribosomal` will search for sequences with any of the ribosomal annotations (28S/25S, 18S, 5.8S), whereas `ribosomal_complete` will try to find sequences with all annotations.

### Example usage

Below are simple examples of how to use the script.
Below are simple examples of how to use the script.

```
# Arabidopsis thaliana chloroplast
./go_fetch.py \
Expand Down
54 changes: 34 additions & 20 deletions go_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,14 @@
# argparse
parser = argparse.ArgumentParser(prog = "go_batch.py", description=argparse_description, usage=argparse_usage)
parser.add_argument("--taxonomy", help="Taxonomy of rank to search for e.g. \"Arabidopsis\"", type=str, required=True)
parser.add_argument("--target", help="Target sequence type.", choices=["chloroplast", "mitochondrion", "ribosomal", "ribosomal_complete"], required=True)
parser.add_argument("--target", help="Target sequence type.", choices=["chloroplast", "mitochondrion", "CO1", "ribosomal_any", "ribosomal_complete", "ribosomal_28S"], required=True)
parser.add_argument("--db", help="Database to search. Either refseq (NCBI) or genbank (INSDC). Default=refseq.", choices=["refseq", "genbank"], required=False, default="refseq")
parser.add_argument("--min", help="Minimum number of target sequences to download.", type=int, required=False)
parser.add_argument("--max", help="Maximum number of target sequences to download. Must be larger than --min.", type=int, required=False)
parser.add_argument("--seed", help="Seed used for subsampling.", type=int, required=False)
parser.add_argument("--output", help="Output directory.", required=False)
parser.add_argument("--overwrite", help="Overwrite output directory.", action="store_true", required=False)
parser.add_argument("--append", help="Append to output directory if it exists.", action="store_true", required=False)
parser.add_argument("--getorganelle", help="Format seed and gene database for get organelle.", action="store_true", required=False)
parser.add_argument("--email", help="Email for Entrez.", required=True)
parser.add_argument("--api", help="API for NCBI.", type=str, required=False)
Expand Down Expand Up @@ -82,16 +83,27 @@

### functions

# create dir and overwrite if specified
def create_dir(dirpath, overwrite):
# create dir and overwrite or append if specified
def create_dir(dirpath, overwrite, append):
if os.path.exists(dirpath):
if overwrite == True:
if overwrite:
shutil.rmtree(dirpath)
os.mkdir(dirpath)
os.mkdir(f"{dirpath}/fasta")
os.mkdir(f"{dirpath}/genbank")
elif append:
# Only create subfolders if they don't exist
if not os.path.exists(f"{dirpath}/fasta"):
os.mkdir(f"{dirpath}/fasta")
if not os.path.exists(f"{dirpath}/genbank"):
os.mkdir(f"{dirpath}/genbank")
# Do not remove anything
else:
sys.exit(f"{dirpath} already exists. Remove or use --overwrite")
os.mkdir(dirpath)
os.mkdir(f"{dirpath}/fasta")
os.mkdir(f"{dirpath}/genbank")
sys.exit(f"{dirpath} already exists. Remove or use --overwrite or --append")
else:
os.mkdir(dirpath)
os.mkdir(f"{dirpath}/fasta")
os.mkdir(f"{dirpath}/genbank")

# get taxonomic id from scientific name
def get_taxonomic_id(taxonomy):
Expand Down Expand Up @@ -204,21 +216,25 @@ def print_phylogeny(main_lineage, optional_children = []):
spacer = "|_" + spacer
else:
spacer = " " + spacer
if optional_children is not []:
if optional_children != []:
for c in optional_children:
print(f"{spacer}{c}")

# generate search term
# target = "chloroplast", "mitochondrion", "ribosomal", "ribosomal_complete"
# target = "chloroplast", "mitochondrion", "CO1", "ribosomal", "ribosomal_complete", "ribosomal_28S"
def search_term(taxid, target, db):
# add taxid to term
term = f"{taxid}[Organism]"
if target == "chloroplast" or target == "mitochondrion":
term += f" AND {target}[Title] AND complete genome[Title]"
if target == "ribosomal":
term = f"({taxid}[Organism] AND (28S[Title] OR 25S[Title])) OR ({taxid}[Organism] AND 18S[Title]) OR ({taxid}[Organism] AND 5.8S[Title])"
if target == "CO1":
term += f" AND (CO1[Gene] OR COX1[Gene]) OR COI[Gene]"
if target == "ribosomal_any":
term = f"({taxid}[Organism] AND (28S[Gene] OR 25S[Gene])) OR ({taxid}[Organism] AND 18S[Gene]) OR ({taxid}[Organism] AND 5.8S[Gene])"
if target == "ribosomal_complete":
term += f" AND (28S[Title] OR 25S[Title]) AND 18S[Title] AND 5.8S[Title]"
term += f" AND (28S[Gene] OR 25S[Gene]) AND 18S[Gene] AND 5.8S[Gene]"
if target == "ribosomal_28S":
term += f"({taxid}[Organism] AND (28S[Gene])"
# refseq are derived from genbank but not part of
if db == "refseq":
term += f" AND refseq[filter]"
Expand Down Expand Up @@ -320,7 +336,7 @@ def recursive_search(taxonomy, lineage, target, db, min_th, max_th, idlist):
print(f"Maximum threshold not exceeded. Downloading {count_idlist_total} sequences\n")

print("\nCreating output directory")
create_dir(args.output, args.overwrite)
create_dir(args.output, args.overwrite, args.append)

# efetch
for i in idlist_combined:
Expand All @@ -345,7 +361,7 @@ def recursive_search(taxonomy, lineage, target, db, min_th, max_th, idlist):
print(f"Downloading the first {count_idlist_subsample} sequences\n")

print("Creating output directory")
create_dir(args.output, args.overwrite)
create_dir(args.output, args.overwrite, args.append)

# efetch
for i in idlist_combined[:count_idlist_subsample]:
Expand Down Expand Up @@ -401,7 +417,7 @@ def recursive_search(taxonomy, lineage, target, db, min_th, max_th, idlist):
list_subsample

print("\nCreating output directory")
create_dir(args.output, args.overwrite)
create_dir(args.output, args.overwrite, args.append)

list_download = idlist.copy()
list_download.extend(list_subsample)
Expand Down Expand Up @@ -461,7 +477,7 @@ def format_gene(path, target):
if target == "mitochondrion" or target == "chloroplast":
cmd_gar.extend(["-o", f"{path}/annotated_regions", "-t", "CDS", "--mix"])
else:
if target == "ribosomal" or target == "ribosomal_complete":
if target == "ribosomal_any" or target == "ribosomal_complete" or target == "ribosomal_28S":
cmd_gar.extend(["-o", f"{path}/annotated_regions", "-t", "rRNA", "--mix"])
# subprocess run
result_gar = subprocess.run(cmd_gar, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
Expand Down Expand Up @@ -522,6 +538,4 @@ def format_gene(path, target):
format_seed(args.output)
format_gene(args.output, args.target)

print("\ngo_fetch complete!")


print("\ngo_fetch complete!")
Loading