Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions MicroTraitLLM_VQA/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
__pycache__/
*.pyc
*.pyo
*.pyd
.DS_Store
apikeys.txt
21 changes: 21 additions & 0 deletions MicroTraitLLM_VQA/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 grogers772

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
3 changes: 3 additions & 0 deletions MicroTraitLLM_VQA/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Rapid advances in Large Language Models (LLMs) and broad potential applications in biological research make them a compelling point of investigation. Key challenges remain, including the tendency of LLMs to hallucinate if they are not provided with domain-specific information, thereby potentially misinforming users; the small number of LLM applications directed to prokaryotes despite significant advances in microbiome studies, especially when compared to the many human-specific and eukaryotic domain-specific LLM applications that have recently been released; and the inability of current microbe-specific LLM tools to provide sufficiently comprehensive, accurate, and timely answers with proper citations.

We present MicroTraitLLM, a retrieval-augmented generation (RAG) LLM which utilizes zero-shot and single-shot prompting to give specific, citation-based answers for researchers. Its connection to the live-updating PubMed Central Open Access article database allows the tool to remain up-to-date on scientific knowledge. MicroTraitLLM flexibly allows the user to customize their experience by selecting their choice of LLMs. The tool is also able to generate accurate citations in various formats. We present empirical results demonstrating that MicroTraitLLM provides both improvements in relevant literature search and informative responses as judged by microbial experts, while not increasing latency time compared to popular commercial LLMs.
26 changes: 26 additions & 0 deletions MicroTraitLLM_VQA/call_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import requests

def call_api(url):
    # Function to call an API and return the raw response body
    # Handles exceptions and returns None if the request fails
    #
    # Parameters:
    #   url : str - the URL to fetch with an HTTP GET
    # Returns:
    #   str - the response body text on success
    #   None - on any request failure (connection error, timeout, non-2xx status)
    try:
        # Send a GET request to the URL; the 60-second timeout keeps a hung
        # endpoint from blocking the caller indefinitely.
        response = requests.get(url, timeout=60)

        # Raise for 4xx/5xx status codes so they are handled below
        response.raise_for_status()

        # Return the body text directly (no JSON parsing is performed here;
        # the previous ValueError handler was unreachable dead code)
        return response.text

    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts, and HTTP error statuses alike
        print(f"Error fetching data from {url}: {e}")
        return None


# Copyright Sep 2025 Glen Rogers.
# Subject to MIT license.
51 changes: 51 additions & 0 deletions MicroTraitLLM_VQA/citations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
def APA_citation(info):
    # Function to generate an APA citation from the provided info dictionary
    # The info dictionary is usually obtained from the function extract_info() contained in pmc_text_api.py
    #
    # Each entry of info['names'] has the form "surname:<Surname>;given-names:<Given>"
    # and is rendered as "Surname, G" (first initial only).
    # Fixes relative to the original:
    #   - a single-author citation no longer gets a spurious "& " prefix
    #   - the last author is detected by index, not by string equality, so
    #     duplicate author names no longer trigger the "&" early
    authorlist = ""
    last_index = len(info['names']) - 1
    for i, author in enumerate(info['names']):
        surname, given_names = author.split(';')
        surname = surname.split(':')[1]
        given_names = given_names.split(':')[1]
        initial = given_names[0]
        if i == last_index:
            # APA puts an ampersand before the final author, but only when
            # there is more than one author.
            prefix = "& " if last_index > 0 else ""
            authorlist = ''.join([authorlist, prefix + surname + ", " + initial])
        else:
            authorlist = ''.join([authorlist, surname + ", " + initial + "., "])

    # publication_date is expected as "YYYY Mon DD"; only the year is used
    pub_year = info['publication_date'].split(' ')[0]

    reference = f"{authorlist}. ({pub_year}). {info['title']}. <i>{info['journal']}</i>, {info['volume']}({info['issue']}), {info['first_page']}. {info['doi']}"

    return reference

def MLA_citation(info):
    # Function to generate an MLA citation from the provided info dictionary
    # The info dictionary is usually obtained from the function extract_info() contained in pmc_text_api.py
    #
    # Fixes relative to the original:
    #   - "et al." is appended only when the article has multiple authors
    #   - a publication_date without a day part no longer raises IndexError
    surname, given_name = info['names'][0].split(';')
    surname = surname.split(':')[1]
    given_name = given_name.split(':')[1]
    main_author = f"{surname}, {given_name}"
    if len(info['names']) > 1:
        author_part = f"{main_author} et al."
    else:
        author_part = f"{main_author}."

    date_parts = info['publication_date'].split(' ')
    if len(date_parts) >= 3:
        # Reorder "YYYY Mon DD" into MLA's "DD Mon. YYYY"
        pub_date = f"{date_parts[2]} {date_parts[1]}. {date_parts[0]}"
    else:
        # Date lacks a day (or month); fall back to the raw string rather
        # than raising IndexError
        pub_date = info['publication_date']

    reference = f'{author_part} "{info["title"]}." <i>{info["journal"]}</i> vol. {info["volume"]}, {info["first_page"]}. {pub_date}, doi:{info["doi"]}'

    return reference

def NLM_citation(info):
    # Function to generate an NLM citation from the provided info dictionary
    # The info dictionary is usually obtained from the function extract_info() contained in pmc_text_api.py
    #
    # Each author entry ("surname:<Surname>;given-names:<Given>") is rendered
    # as "Surname G" using only the first initial of the given names.
    formatted_authors = []
    for entry in info['names']:
        surname_part, given_part = entry.split(';')
        last_name = surname_part.split(':')[1]
        first_initial = given_part.split(':')[1][0]
        formatted_authors.append(f"{last_name} {first_initial}")

    # Join with ", " rather than trimming a trailing separator
    authorlist = ", ".join(formatted_authors)
    return (
        f"{authorlist}. {info['title']}. <i>{info['journal']}</i>. "
        f"{info['publication_date']};{info['volume']}:{info['first_page']}. "
        f"doi: {info['doi']}. PMID: {info['pmid']}; PMCID: {info['pmcid']}."
    )

# Copyright Sep 2025 Glen Rogers.
# Subject to MIT license.
127 changes: 127 additions & 0 deletions MicroTraitLLM_VQA/compile_supplement_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
from openai import OpenAI
import os
from openai import OpenAI
from ollama import chat
import inspect

def generate_summary(papers,question,model,citation_format,citations,temperature,api_key):
    # Function to generate a summary of summaries based on the provided papers and user input
    #
    # Parameters:
    #   papers          : collection of article summaries, grades, and citations handed to the LLM as context
    #   question        : the user's question to be answered
    #   model           : LLM selector; "ChatGPT-4o-mini" uses OpenAI, "llama-3.3-70b-versatile" uses
    #                     Groq's OpenAI-compatible endpoint, anything else goes to local Ollama `chat`
    #   citation_format : citation style name interpolated into the prompt (e.g. APA, MLA, NLM)
    #   citations       : pre-formatted reference list appended verbatim after the LLM answer
    #   temperature     : sampling temperature forwarded to the chosen LLM
    #   api_key         : fallback key used when the matching environment variable is not set
    # Returns: the LLM answer string with a "References" section appended.
    #
    # Define the context and example for the LLM
    Context = f"""You are an expert in microbial metagenomics and microbial traits. You are tasked with answering the question provided by the user. All information required to answer the question will be provided to you in a large text format as a list. Each entry in the list will contain an article summary, a grade for how well the article answers the user question, and the citation for the article in {citation_format} format. \n
Your answer should be a detailed paragraph consisting of 5-10 sentences answering the user question. Additionally, when writing your response, every source you use should be cited in an in-text format within your response in {citation_format} format. You should never have a references section at the end of your response. \n
If you cannot cite the given information in {citation_format} format, please instead list the article title. \n
You should prioritize using the articles with a higher grade over the ones with a lower grade. \n
You must answer the user question to the best of your ability without using any other information besides the information provided to you. \n"""

    # One-shot example question for the LLM
    EX1 = 'Question: What are some bacterial strains associated with ear infections?\n'

    # One-shot example answer demonstrating the expected in-text citation style
    A1 = """Ear infections, also known as otitis media, can be caused by various bacterial strains. Some common bacterial strains associated with ear infections include Streptococcus pneumoniae, Haemophilus influenzae, and Moraxella catarrhalis (Schilder et al. 2016). These bacteria are often found in the upper respiratory tract and can migrate to the middle ear, leading to infection. Streptococcus pneumoniae is one of the most common bacterial pathogens causing ear infections, particularly in children. Haemophilus influenzae is another significant contributor to ear infections, especially in cases where the pneumococcal vaccine has been effective in reducing Streptococcus pneumoniae infections (Kaur et al 2017). Moraxella catarrhalis is also known to be involved in ear infections, particularly in cases of chronic otitis media. Understanding the bacterial strains associated with ear infections is crucial for appropriate diagnosis and treatment strategies.\n
"""

    # Conversation sent to every backend: system context, one-shot example, source material, question
    all_messages = [
        {"role": "system", "content": Context},
        {"role": "user", "content": f"An example question is {EX1}"},
        {"role": "assistant", "content": f"An example output is: {A1}"},
        {"role": "user", "content": f"The information to answer the given question is: {papers}"},
        {"role": "user", "content": question}
    ]
    # Check the model type and call the appropriate LLM to create the summary of summaries
    if model == "ChatGPT-4o-mini":
        client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", api_key))
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=all_messages,
            temperature=temperature,
        )
        final_response = response.choices[0].message.content

    elif model == "llama-3.3-70b-versatile":
        # Groq exposes an OpenAI-compatible endpoint, so the OpenAI client is reused
        client = OpenAI(base_url = "https://api.groq.com/openai/v1",api_key=os.environ.get("GROQ_API_KEY", api_key))
        response = client.chat.completions.create(
            model=model,
            messages=all_messages,
            temperature=temperature)
        final_response = response.choices[0].message.content
    else:
        # Any other model name is assumed to be served by a local Ollama instance
        response = chat(
            model=model,
            messages=all_messages,
            options = {'temperature': temperature})
        final_response = response.message.content
    # Append the citations to the final response.
    # NOTE(review): stack[-2] inspects a frame near the BOTTOM of the call stack
    # (close to the program entry point), not the immediate caller; this couples
    # the HTML-vs-plaintext formatting choice to the app's exact call depth.
    # Confirm "ask" is still detected if the call chain ever changes.
    stack = inspect.stack()
    call_fun = stack[-2].function
    if call_fun == "ask":
        # Web-UI path: HTML line breaks
        final_response = final_response + f"<br><br>References: <br>{citations}"
    else:
        # CLI/other path: plain newlines
        final_response = final_response + f"\n\nReferences: \n{citations}"
    return final_response

def generate_supplement(final_response,model,temperature,api_key):
    # Function to generate a supplement from the final response
    # This is used to extract terms (taxa, genes, proteins) from the final response
    #
    # Parameters:
    #   final_response : the answer paragraph produced by generate_summary()
    #   model          : LLM selector; "ChatGPT-4o-mini" uses OpenAI, "llama-3.3-70b-versatile"
    #                    uses Groq's OpenAI-compatible endpoint, anything else goes to local Ollama
    #   temperature    : sampling temperature forwarded to the chosen LLM
    #   api_key        : fallback key used when the matching environment variable is not set
    # Returns: the model's raw reply, expected to look like
    #   '[["Taxonomy", ...],["Gene", ...]]' (or "None" per absent category).
    #
    # Define the context and example for the LLM
    context = """Your job is to read the provided paragraph, and note the species, genes, or proteins referenced inside the paragraph.
If they are, you are to list each within its own respective Python list. The following order for the lists should always be used: Taxonomy, then Protein, then Gene.
The genus, species, and subspecies for one organism should constitute one term in the list.
If only the genus is listed, exclude it from the list. The first term in each list must always be the term you are looking for, which in this case is Taxonomy, Gene, or Protein respectively.
If you do not identify any organisms, genes, or proteins, instead return "None" for the category list. Never abbreviate the organism name, gene, or protein name.
If an abbreviated species name is provided, omit it from the output list you are creating. Ignore any cases with a prime symbol in them, such as 'aph(3')-Ia'.
"""
    # One-shot example paragraph for the LLM
    EX3 = """In Escherichia coli (E. coli), several genes are commonly associated with antibiotic resistance,
reflecting the bacterium's ability to evade the effects of various antibiotics.
Notably, the **blaCTX-M**, **blaTEM**, and **blaSHV** genes encode for extended-spectrum
beta-lactamases (ESBLs), which confer resistance to a wide range of beta-lactam antibiotics
(Ahmad, Joji, & Shahid, 2022). Additionally, the **mcr-1** gene is significant for providing
resistance to colistin, a last-resort antibiotic for treating multidrug-resistant infections
(Nasrollahian, Graham, & Halaji, 2024). Other important resistance genes include **aac(3)-Ib-cr**,
which is linked to aminoglycoside resistance, and **qnr** genes that protect against fluoroquinolones
by encoding proteins that shield target enzymes from antibiotic action (Nasrollahian et al., 2024).
Furthermore, the **sul1**, **sul2**, and **sul3** genes are associated with sulfonamide resistance,
while **tetA** and **tetB** are linked to tetracycline resistance (Ribeiro et al., 2023). The presence
of these genes highlights the genetic diversity and complexity of antibiotic resistance mechanisms in
E. coli, emphasizing the need for ongoing surveillance and management strategies to combat this public
health challenge (Silva et al., 2024).
"""
    # One-shot example output showing the expected list-of-lists format
    A3 = """[["Taxonomy", "Escherichia coli"],["Gene", "blaCTX-M", "blaTEM", "blaSHV","mcr-1","aac(3)-Ib-cr","qnr","sul1", "sul2","sul3","tetA","tetB"]]"""

    # The identical conversation is sent regardless of backend, so build it once
    # instead of duplicating it in each branch (the original repeated this list
    # verbatim three times).
    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": EX3},
        {"role": "assistant", "content": A3},
        {"role": "user", "content": f"The paragraph is: {final_response}"},
    ]
    # Check the model type and call the appropriate LLM to create the supplement
    if model == "ChatGPT-4o-mini":
        client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", api_key))
        supplement = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            temperature=temperature,
        )
        termlist = supplement.choices[0].message.content
    elif model == "llama-3.3-70b-versatile":
        # Groq exposes an OpenAI-compatible endpoint, so the OpenAI client is reused
        client = OpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.environ.get("GROQ_API_KEY", api_key))
        supplement = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature)
        termlist = supplement.choices[0].message.content
    else:
        # Any other model name is assumed to be served by a local Ollama instance
        supplement = chat(
            model=model,
            messages=messages,
            options={'temperature': temperature})
        termlist = supplement.message.content
    return termlist

# Copyright Sep 2025 Glen Rogers.
# Subject to MIT license.
147 changes: 147 additions & 0 deletions MicroTraitLLM_VQA/figure_ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import os
import uuid
import json
import base64
from typing import Dict, Any

from openai import OpenAI
from read_api_keys import load_api_keys

# Storage locations for ingested figures and their metadata index
DATA_DIR = "data"
FIGURE_DIR = os.path.join(DATA_DIR, "figures")
# JSONL file: one record per ingested figure (see ingest_figure)
FIGURE_INDEX_PATH = os.path.join(DATA_DIR, "figure_index.jsonl")

# Make sure directories exist (import-time side effect)
os.makedirs(FIGURE_DIR, exist_ok=True)

# OpenAI client for vision + embeddings.
# NOTE(review): apikeys.txt is read at import time; if the API_KEY_OPENAI entry
# is missing the client is still constructed (with api_key=None) and the
# helper functions below raise RuntimeError when first called.
_api_keys = load_api_keys("apikeys.txt")
_api_key_openai = _api_keys.get("API_KEY_OPENAI")
client = OpenAI(api_key=_api_key_openai)


def _extract_metadata_from_image(image_bytes: bytes) -> Dict[str, Any]:
    """
    Ask a multimodal model to describe a scientific figure.

    Produces a dict with:
    - caption: concise description of the figure
    - ocr_text: any text / labels the model can read
    - chart_data: table-like structure of key numbers, if present
    """
    if not _api_key_openai:
        raise RuntimeError("API_KEY_OPENAI missing in apikeys.txt")

    b64 = base64.b64encode(image_bytes).decode("utf-8")
    figure_url = f"data:image/png;base64,{b64}"

    system_prompt = (
        "You are a vision model analyzing scientific figures. "
        "Return a JSON object with fields: "
        "`caption` (string), "
        "`ocr_text` (string of all readable text), and "
        "`chart_data` (an array of rows; each row an object with keys like "
        "axis_labels, series, values, units, etc.). "
        "If something is missing, use an empty string or empty array."
    )
    user_content = [
        {
            "type": "text",
            "text": "Analyze this figure and extract caption, OCR text, and structured chart/table data as JSON.",
        },
        {"type": "image_url", "image_url": {"url": figure_url}},
    ]

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        temperature=0.0,
    )

    raw = completion.choices[0].message.content
    try:
        result = json.loads(raw)
    except json.JSONDecodeError:
        # Model did not return valid JSON; keep the raw text as the caption.
        result = {"caption": raw, "ocr_text": "", "chart_data": []}

    # Guarantee all three keys exist for downstream consumers.
    for key, default in (("caption", ""), ("ocr_text", ""), ("chart_data", [])):
        result.setdefault(key, default)

    return result


def _embed_text(text: str) -> list:
    """
    Return an embedding vector for *text* (caption + OCR text).

    Used for the text index (and as a stand-in for an image embedding for now).
    """
    if not _api_key_openai:
        raise RuntimeError("API_KEY_OPENAI missing in apikeys.txt")

    response = client.embeddings.create(model="text-embedding-3-small", input=[text])
    return response.data[0].embedding


def ingest_figure(image_bytes: bytes, paper_id: str | None = None) -> Dict[str, Any]:
    """
    Main ingestion entry point for a single figure image.

    Pipeline:
    1) Run vision model to get metadata (caption, ocr_text, chart_data).
    2) Compute embeddings over caption + OCR text.
    3) Save original image bytes to disk as PNG.
    4) Append a record to figure_index.jsonl.

    Returns the stored record, including a generated figure_id.
    """
    # Step 1: metadata from vision model
    metadata = _extract_metadata_from_image(image_bytes)

    # Step 2: embeddings (empty when there is no text to embed)
    combined_text = (metadata.get("caption", "") + "\n" +
                     metadata.get("ocr_text", "")).strip()
    text_embedding = _embed_text(combined_text) if combined_text else []
    # No dedicated image encoder yet, so the text embedding stands in for it.
    image_embedding = text_embedding

    # Step 3: save original image under a fresh UUID
    figure_id = str(uuid.uuid4())
    image_path = os.path.join(FIGURE_DIR, f"{figure_id}.png")
    with open(image_path, "wb") as out:
        out.write(image_bytes)

    # Step 4: build record
    record: Dict[str, Any] = {
        "figure_id": figure_id,
        "paper_id": paper_id,
        "caption": metadata.get("caption", ""),
        "ocr_text": metadata.get("ocr_text", ""),
        "chart_data": metadata.get("chart_data", []),
        "text_embedding": text_embedding,
        "image_embedding": image_embedding,
        "image_path": image_path,
    }

    # Append to the JSONL index
    with open(FIGURE_INDEX_PATH, "a", encoding="utf-8") as out:
        out.write(json.dumps(record) + "\n")

    return record
Loading