o-william-white · srisarya · Apr 2, 2025 · May 20, 2025 · May 22, 2025 · May 23, 2025
diff --git a/README.md b/README.md
@@ -25,24 +25,26 @@ Below is a summary of the main input parameters.
   --seed SEED           Seed used for subsampling.
   --output OUTPUT       Output directory.
   --overwrite           Overwrite output directory.
+  --append              Append results into pre-existing directories (if running >1)
   --getorganelle        Format seed and gene database for get organelle.
   --email EMAIL         Email for Entrez.
   --api API             API key for NCBI. Optional.
 ```
 
 ### How it works
 
-The user provides a taxonomy as a taxonomic name (e.g. "Arabidopsis thaliana") or NCBI taxonomy ID (e.g. "3702"). However, **it is reccomended to use NCBI taxonomy IDs**, as some unrelated groups share names, for example the name "Drosophila" is used for both a genus of fruit flies and basidiomycete fungi. 
+The user provides a taxonomy as a taxonomic name (e.g. "Arabidopsis thaliana") or NCBI taxonomy ID (e.g. "3702"). However, **it is recommended to use NCBI taxonomy IDs**, as some unrelated groups share names, for example the name "Drosophila" is used for both a genus of fruit flies and basidiomycete fungi.
 
-The script will count the number or target sequences available for the user given taxonomy. If there are not enough sequence available based on the `--min` parameter, the script will count the number of sequences available for the parent rank, until the minimum threshold is reached. 
+The script will count the number or target sequences available for the user given taxonomy. If there are not enough sequence available based on the `--min` parameter, the script will count the number of sequences available for the parent rank, until the minimum threshold is reached.
 
-If the maximum number of sequences is exceeded, based on the `--max` parameter, the script will subsample the children lineages, aiming for an even sampling across children. 
+If the maximum number of sequences is exceeded, based on the `--max` parameter, the script will subsample the children lineages, aiming for an even sampling across children.
 
-Note that the `--target` option `ribosomal` will search for sequences with any of the ribosomal annotations (28S/25S, 18S, 5.8S), whereas `ribosomal_complete` will try to find sequences with all annotations. 
+Note that the `--target` option `ribosomal` will search for sequences with any of the ribosomal annotations (28S/25S, 18S, 5.8S), whereas `ribosomal_complete` will try to find sequences with all annotations.
 
 ### Example usage
 
-Below are simple examples of how to use the script. 
+Below are simple examples of how to use the script.
+
 ```
 # Arabidopsis thaliana chloroplast
 ./go_fetch.py \

diff --git a/go_fetch.py b/go_fetch.py
@@ -22,13 +22,14 @@
 # argparse
 parser = argparse.ArgumentParser(prog = "go_batch.py", description=argparse_description, usage=argparse_usage)
 parser.add_argument("--taxonomy",     help="Taxonomy of rank to search for e.g. \"Arabidopsis\"", type=str, required=True)
-parser.add_argument("--target",       help="Target sequence type.", choices=["chloroplast", "mitochondrion", "ribosomal", "ribosomal_complete"], required=True)
+parser.add_argument("--target",       help="Target sequence type.", choices=["chloroplast", "mitochondrion", "CO1", "ribosomal_any", "ribosomal_complete", "ribosomal_28S"], required=True)
 parser.add_argument("--db",           help="Database to search. Either refseq (NCBI) or genbank (INSDC). Default=refseq.", choices=["refseq", "genbank"], required=False, default="refseq")
 parser.add_argument("--min",          help="Minimum number of target sequences to download.", type=int, required=False)
 parser.add_argument("--max",          help="Maximum number of target sequences to download. Must be larger than --min.", type=int, required=False)
 parser.add_argument("--seed",         help="Seed used for subsampling.", type=int, required=False)
 parser.add_argument("--output",       help="Output directory.", required=False)
 parser.add_argument("--overwrite",    help="Overwrite output directory.", action="store_true", required=False)
+parser.add_argument("--append", help="Append to output directory if it exists.", action="store_true", required=False)
 parser.add_argument("--getorganelle", help="Format seed and gene database for get organelle.", action="store_true", required=False)
 parser.add_argument("--email",        help="Email for Entrez.", required=True)
 parser.add_argument("--api",          help="API for NCBI.", type=str, required=False)
@@ -82,16 +83,27 @@
 
 ### functions
 
-# create dir and overwrite if specified
-def create_dir(dirpath, overwrite):
+# create dir and overwrite or append if specified
+def create_dir(dirpath, overwrite, append):
     if os.path.exists(dirpath):
-        if overwrite == True:
+        if overwrite:
             shutil.rmtree(dirpath)
+            os.mkdir(dirpath)
+            os.mkdir(f"{dirpath}/fasta")
+            os.mkdir(f"{dirpath}/genbank")
+        elif append:
+            # Only create subfolders if they don't exist
+            if not os.path.exists(f"{dirpath}/fasta"):
+                os.mkdir(f"{dirpath}/fasta")
+            if not os.path.exists(f"{dirpath}/genbank"):
+                os.mkdir(f"{dirpath}/genbank")
+            # Do not remove anything
         else:
-            sys.exit(f"{dirpath} already exists. Remove or use --overwrite")
-    os.mkdir(dirpath)
-    os.mkdir(f"{dirpath}/fasta")
-    os.mkdir(f"{dirpath}/genbank")
+            sys.exit(f"{dirpath} already exists. Remove or use --overwrite or --append")
+    else:
+        os.mkdir(dirpath)
+        os.mkdir(f"{dirpath}/fasta")
+        os.mkdir(f"{dirpath}/genbank")
 
 # get taxonomic id from scientific name
 def get_taxonomic_id(taxonomy):
@@ -204,21 +216,25 @@ def print_phylogeny(main_lineage, optional_children = []):
             spacer = "|_" + spacer
         else:
             spacer = "  " + spacer
-    if optional_children is not []:
+    if optional_children != []:
         for c in optional_children:
             print(f"{spacer}{c}")
 
 # generate search term
-# target = "chloroplast", "mitochondrion", "ribosomal", "ribosomal_complete"
+# target = "chloroplast", "mitochondrion", "CO1", "ribosomal", "ribosomal_complete", "ribosomal_28S"
 def search_term(taxid, target, db):
     # add taxid to term
     term = f"{taxid}[Organism]"
     if target == "chloroplast" or target == "mitochondrion":
         term += f" AND {target}[Title] AND complete genome[Title]"
-    if target == "ribosomal":
-        term = f"({taxid}[Organism] AND (28S[Title] OR 25S[Title])) OR ({taxid}[Organism] AND 18S[Title]) OR ({taxid}[Organism] AND 5.8S[Title])"
+    if target == "CO1":
+        term += f" AND (CO1[Gene] OR COX1[Gene]) OR COI[Gene]"
+    if target == "ribosomal_any":
+        term = f"({taxid}[Organism] AND (28S[Gene] OR 25S[Gene])) OR ({taxid}[Organism] AND 18S[Gene]) OR ({taxid}[Organism] AND 5.8S[Gene])"
     if target == "ribosomal_complete":
-        term += f" AND (28S[Title] OR 25S[Title]) AND 18S[Title] AND 5.8S[Title]"
+        term += f" AND (28S[Gene] OR 25S[Gene]) AND 18S[Gene] AND 5.8S[Gene]"
+    if target == "ribosomal_28S":
+        term += f"({taxid}[Organism] AND (28S[Gene])"
     # refseq are derived from genbank but not part of
     if db == "refseq":
         term += f" AND refseq[filter]"
@@ -320,7 +336,7 @@ def recursive_search(taxonomy, lineage, target, db, min_th, max_th, idlist):
             print(f"Maximum threshold not exceeded. Downloading {count_idlist_total} sequences\n")
 
             print("\nCreating output directory")
-            create_dir(args.output, args.overwrite)
+            create_dir(args.output, args.overwrite, args.append)
 
             # efetch
             for i in idlist_combined:
@@ -345,7 +361,7 @@ def recursive_search(taxonomy, lineage, target, db, min_th, max_th, idlist):
                 print(f"Downloading the first {count_idlist_subsample} sequences\n")
 
                 print("Creating output directory")
-                create_dir(args.output, args.overwrite)
+                create_dir(args.output, args.overwrite, args.append)
 
                 # efetch
                 for i in idlist_combined[:count_idlist_subsample]:
@@ -401,7 +417,7 @@ def recursive_search(taxonomy, lineage, target, db, min_th, max_th, idlist):
                 list_subsample
 
                 print("\nCreating output directory")
-                create_dir(args.output, args.overwrite)
+                create_dir(args.output, args.overwrite, args.append)
 
                 list_download = idlist.copy()
                 list_download.extend(list_subsample)
@@ -461,7 +477,7 @@ def format_gene(path, target):
     if target == "mitochondrion" or target == "chloroplast":
         cmd_gar.extend(["-o", f"{path}/annotated_regions", "-t", "CDS", "--mix"])
     else:
-        if target == "ribosomal" or target == "ribosomal_complete":
+        if target == "ribosomal_any" or target == "ribosomal_complete" or target == "ribosomal_28S":
             cmd_gar.extend(["-o", f"{path}/annotated_regions", "-t", "rRNA", "--mix"])
     # subprocess run
     result_gar = subprocess.run(cmd_gar, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
@@ -522,6 +538,4 @@ def format_gene(path, target):
     format_seed(args.output)
     format_gene(args.output, args.target)
 
-print("\ngo_fetch complete!")
-
-
+print("\ngo_fetch complete!")