Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions MicroTraitLLM_VQA/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
__pycache__/
*.pyc
*.pyo
*.pyd
.DS_Store
apikeys.txt
21 changes: 21 additions & 0 deletions MicroTraitLLM_VQA/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 grogers772

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
3 changes: 3 additions & 0 deletions MicroTraitLLM_VQA/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Rapid advances in Large Language Models (LLMs) and broad potential applications in biological research make them a compelling point of investigation. Key challenges remain, including the tendency of LLMs to hallucinate if they are not provided with domain-specific information, thereby potentially misinforming users; the small number of LLM applications directed to prokaryotes despite significant advances in microbiome studies, especially when compared to the many human-specific and eukaryotic domain-specific LLM applications that have recently been released; and the inability of current microbe-specific LLM tools to provide sufficiently comprehensive, accurate, and timely answers with proper citations.

We present MicroTraitLLM, a retrieval-augmented generation (RAG) LLM which utilizes zero-shot and single-shot prompting to give specific, citation-based answers for researchers. Its connection to the live-updating PubMed Central Open Access article database allows the tool to remain up-to-date on scientific knowledge. MicroTraitLLM flexibly allows the user to customize their experience by selecting their choice of LLMs. The tool is also able to generate accurate citations in various formats. We present empirical results demonstrating that MicroTraitLLM provides both improvements in relevant literature search and informative responses as judged by microbial experts, while not increasing latency time compared to popular commercial LLMs.
26 changes: 26 additions & 0 deletions MicroTraitLLM_VQA/call_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import requests

def call_api(url):
    # Function to call an API and return the raw response body
    # Handles exceptions and returns None if the request fails
    #
    # Parameters:
    #   url : str - the URL to fetch with an HTTP GET
    # Returns:
    #   str - the response body text on success
    #   None - on any request failure (connection error, timeout, non-2xx status)
    try:
        # Send a GET request to the URL; the 60-second timeout keeps a hung
        # endpoint from blocking the caller indefinitely.
        response = requests.get(url, timeout=60)

        # Raise for 4xx/5xx status codes so they are handled below
        response.raise_for_status()

        # Return the body text directly (no JSON parsing is performed here;
        # the previous ValueError handler was unreachable dead code)
        return response.text

    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts, and HTTP error statuses alike
        print(f"Error fetching data from {url}: {e}")
        return None


# Copyright Sep 2025 Glen Rogers.
# Subject to MIT license.
51 changes: 51 additions & 0 deletions MicroTraitLLM_VQA/citations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
def APA_citation(info):
    # Function to generate an APA citation from the provided info dictionary
    # The info dictionary is usually obtained from the function extract_info() contained in pmc_text_api.py
    #
    # Each entry of info['names'] has the form "surname:<Surname>;given-names:<Given>"
    # and is rendered as "Surname, G" (first initial only).
    # Fixes relative to the original:
    #   - a single-author citation no longer gets a spurious "& " prefix
    #   - the last author is detected by index, not by string equality, so
    #     duplicate author names no longer trigger the "&" early
    authorlist = ""
    last_index = len(info['names']) - 1
    for i, author in enumerate(info['names']):
        surname, given_names = author.split(';')
        surname = surname.split(':')[1]
        given_names = given_names.split(':')[1]
        initial = given_names[0]
        if i == last_index:
            # APA puts an ampersand before the final author, but only when
            # there is more than one author.
            prefix = "& " if last_index > 0 else ""
            authorlist = ''.join([authorlist, prefix + surname + ", " + initial])
        else:
            authorlist = ''.join([authorlist, surname + ", " + initial + "., "])

    # publication_date is expected as "YYYY Mon DD"; only the year is used
    pub_year = info['publication_date'].split(' ')[0]

    reference = f"{authorlist}. ({pub_year}). {info['title']}. <i>{info['journal']}</i>, {info['volume']}({info['issue']}), {info['first_page']}. {info['doi']}"

    return reference

def MLA_citation(info):
    # Function to generate an MLA citation from the provided info dictionary
    # The info dictionary is usually obtained from the function extract_info() contained in pmc_text_api.py
    #
    # Fixes relative to the original:
    #   - "et al." is appended only when the article has multiple authors
    #   - a publication_date without a day part no longer raises IndexError
    surname, given_name = info['names'][0].split(';')
    surname = surname.split(':')[1]
    given_name = given_name.split(':')[1]
    main_author = f"{surname}, {given_name}"
    if len(info['names']) > 1:
        author_part = f"{main_author} et al."
    else:
        author_part = f"{main_author}."

    date_parts = info['publication_date'].split(' ')
    if len(date_parts) >= 3:
        # Reorder "YYYY Mon DD" into MLA's "DD Mon. YYYY"
        pub_date = f"{date_parts[2]} {date_parts[1]}. {date_parts[0]}"
    else:
        # Date lacks a day (or month); fall back to the raw string rather
        # than raising IndexError
        pub_date = info['publication_date']

    reference = f'{author_part} "{info["title"]}." <i>{info["journal"]}</i> vol. {info["volume"]}, {info["first_page"]}. {pub_date}, doi:{info["doi"]}'

    return reference

def NLM_citation(info):
    # Function to generate an NLM citation from the provided info dictionary
    # The info dictionary is usually obtained from the function extract_info() contained in pmc_text_api.py
    #
    # Each author entry ("surname:<Surname>;given-names:<Given>") is rendered
    # as "Surname G" using only the first initial of the given names.
    formatted_authors = []
    for entry in info['names']:
        surname_part, given_part = entry.split(';')
        last_name = surname_part.split(':')[1]
        first_initial = given_part.split(':')[1][0]
        formatted_authors.append(f"{last_name} {first_initial}")

    # Join with ", " rather than trimming a trailing separator
    authorlist = ", ".join(formatted_authors)
    return (
        f"{authorlist}. {info['title']}. <i>{info['journal']}</i>. "
        f"{info['publication_date']};{info['volume']}:{info['first_page']}. "
        f"doi: {info['doi']}. PMID: {info['pmid']}; PMCID: {info['pmcid']}."
    )

# Copyright Sep 2025 Glen Rogers.
# Subject to MIT license.
127 changes: 127 additions & 0 deletions MicroTraitLLM_VQA/compile_supplement_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
from openai import OpenAI
import os
from openai import OpenAI
from ollama import chat
import inspect

def generate_summary(papers,question,model,citation_format,citations,temperature,api_key):
    # Function to generate a summary of summaries based on the provided papers and user input
    #
    # Parameters:
    #   papers          : collection of article summaries, grades, and citations handed to the LLM as context
    #   question        : the user's question to be answered
    #   model           : LLM selector; "ChatGPT-4o-mini" uses OpenAI, "llama-3.3-70b-versatile" uses
    #                     Groq's OpenAI-compatible endpoint, anything else goes to local Ollama `chat`
    #   citation_format : citation style name interpolated into the prompt (e.g. APA, MLA, NLM)
    #   citations       : pre-formatted reference list appended verbatim after the LLM answer
    #   temperature     : sampling temperature forwarded to the chosen LLM
    #   api_key         : fallback key used when the matching environment variable is not set
    # Returns: the LLM answer string with a "References" section appended.
    #
    # Define the context and example for the LLM
    Context = f"""You are an expert in microbial metagenomics and microbial traits. You are tasked with answering the question provided by the user. All information required to answer the question will be provided to you in a large text format as a list. Each entry in the list will contain an article summary, a grade for how well the article answers the user question, and the citation for the article in {citation_format} format. \n
Your answer should be a detailed paragraph consisting of 5-10 sentences answering the user question. Additionally, when writing your response, every source you use should be cited in an in-text format within your response in {citation_format} format. You should never have a references section at the end of your response. \n
If you cannot cite the given information in {citation_format} format, please instead list the article title. \n
You should prioritize using the articles with a higher grade over the ones with a lower grade. \n
You must answer the user question to the best of your ability without using any other information besides the information provided to you. \n"""

    # One-shot example question for the LLM
    EX1 = 'Question: What are some bacterial strains associated with ear infections?\n'

    # One-shot example answer demonstrating the expected in-text citation style
    A1 = """Ear infections, also known as otitis media, can be caused by various bacterial strains. Some common bacterial strains associated with ear infections include Streptococcus pneumoniae, Haemophilus influenzae, and Moraxella catarrhalis (Schilder et al. 2016). These bacteria are often found in the upper respiratory tract and can migrate to the middle ear, leading to infection. Streptococcus pneumoniae is one of the most common bacterial pathogens causing ear infections, particularly in children. Haemophilus influenzae is another significant contributor to ear infections, especially in cases where the pneumococcal vaccine has been effective in reducing Streptococcus pneumoniae infections (Kaur et al 2017). Moraxella catarrhalis is also known to be involved in ear infections, particularly in cases of chronic otitis media. Understanding the bacterial strains associated with ear infections is crucial for appropriate diagnosis and treatment strategies.\n
"""

    # Conversation sent to every backend: system context, one-shot example, source material, question
    all_messages = [
        {"role": "system", "content": Context},
        {"role": "user", "content": f"An example question is {EX1}"},
        {"role": "assistant", "content": f"An example output is: {A1}"},
        {"role": "user", "content": f"The information to answer the given question is: {papers}"},
        {"role": "user", "content": question}
    ]
    # Check the model type and call the appropriate LLM to create the summary of summaries
    if model == "ChatGPT-4o-mini":
        client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", api_key))
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=all_messages,
            temperature=temperature,
        )
        final_response = response.choices[0].message.content

    elif model == "llama-3.3-70b-versatile":
        # Groq exposes an OpenAI-compatible endpoint, so the OpenAI client is reused
        client = OpenAI(base_url = "https://api.groq.com/openai/v1",api_key=os.environ.get("GROQ_API_KEY", api_key))
        response = client.chat.completions.create(
            model=model,
            messages=all_messages,
            temperature=temperature)
        final_response = response.choices[0].message.content
    else:
        # Any other model name is assumed to be served by a local Ollama instance
        response = chat(
            model=model,
            messages=all_messages,
            options = {'temperature': temperature})
        final_response = response.message.content
    # Append the citations to the final response.
    # NOTE(review): stack[-2] inspects a frame near the BOTTOM of the call stack
    # (close to the program entry point), not the immediate caller; this couples
    # the HTML-vs-plaintext formatting choice to the app's exact call depth.
    # Confirm "ask" is still detected if the call chain ever changes.
    stack = inspect.stack()
    call_fun = stack[-2].function
    if call_fun == "ask":
        # Web-UI path: HTML line breaks
        final_response = final_response + f"<br><br>References: <br>{citations}"
    else:
        # CLI/other path: plain newlines
        final_response = final_response + f"\n\nReferences: \n{citations}"
    return final_response

def generate_supplement(final_response,model,temperature,api_key):
    # Function to generate a supplement from the final response
    # This is used to extract terms (taxa, genes, proteins) from the final response
    #
    # Parameters:
    #   final_response : the answer paragraph produced by generate_summary()
    #   model          : LLM selector; "ChatGPT-4o-mini" uses OpenAI, "llama-3.3-70b-versatile"
    #                    uses Groq's OpenAI-compatible endpoint, anything else goes to local Ollama
    #   temperature    : sampling temperature forwarded to the chosen LLM
    #   api_key        : fallback key used when the matching environment variable is not set
    # Returns: the model's raw reply, expected to look like
    #   '[["Taxonomy", ...],["Gene", ...]]' (or "None" per absent category).
    #
    # Define the context and example for the LLM
    context = """Your job is to read the provided paragraph, and note the species, genes, or proteins referenced inside the paragraph.
If they are, you are to list each within its own respective Python list. The following order for the lists should always be used: Taxonomy, then Protein, then Gene.
The genus, species, and subspecies for one organism should constitute one term in the list.
If only the genus is listed, exclude it from the list. The first term in each list must always be the term you are looking for, which in this case is Taxonomy, Gene, or Protein respectively.
If you do not identify any organisms, genes, or proteins, instead return "None" for the category list. Never abbreviate the organism name, gene, or protein name.
If an abbreviated species name is provided, omit it from the output list you are creating. Ignore any cases with a prime symbol in them, such as 'aph(3')-Ia'.
"""
    # One-shot example paragraph for the LLM
    EX3 = """In Escherichia coli (E. coli), several genes are commonly associated with antibiotic resistance,
reflecting the bacterium's ability to evade the effects of various antibiotics.
Notably, the **blaCTX-M**, **blaTEM**, and **blaSHV** genes encode for extended-spectrum
beta-lactamases (ESBLs), which confer resistance to a wide range of beta-lactam antibiotics
(Ahmad, Joji, & Shahid, 2022). Additionally, the **mcr-1** gene is significant for providing
resistance to colistin, a last-resort antibiotic for treating multidrug-resistant infections
(Nasrollahian, Graham, & Halaji, 2024). Other important resistance genes include **aac(3)-Ib-cr**,
which is linked to aminoglycoside resistance, and **qnr** genes that protect against fluoroquinolones
by encoding proteins that shield target enzymes from antibiotic action (Nasrollahian et al., 2024).
Furthermore, the **sul1**, **sul2**, and **sul3** genes are associated with sulfonamide resistance,
while **tetA** and **tetB** are linked to tetracycline resistance (Ribeiro et al., 2023). The presence
of these genes highlights the genetic diversity and complexity of antibiotic resistance mechanisms in
E. coli, emphasizing the need for ongoing surveillance and management strategies to combat this public
health challenge (Silva et al., 2024).
"""
    # One-shot example output showing the expected list-of-lists format
    A3 = """[["Taxonomy", "Escherichia coli"],["Gene", "blaCTX-M", "blaTEM", "blaSHV","mcr-1","aac(3)-Ib-cr","qnr","sul1", "sul2","sul3","tetA","tetB"]]"""

    # The identical conversation is sent regardless of backend, so build it once
    # instead of duplicating it in each branch (the original repeated this list
    # verbatim three times).
    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": EX3},
        {"role": "assistant", "content": A3},
        {"role": "user", "content": f"The paragraph is: {final_response}"},
    ]
    # Check the model type and call the appropriate LLM to create the supplement
    if model == "ChatGPT-4o-mini":
        client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", api_key))
        supplement = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            temperature=temperature,
        )
        termlist = supplement.choices[0].message.content
    elif model == "llama-3.3-70b-versatile":
        # Groq exposes an OpenAI-compatible endpoint, so the OpenAI client is reused
        client = OpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.environ.get("GROQ_API_KEY", api_key))
        supplement = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature)
        termlist = supplement.choices[0].message.content
    else:
        # Any other model name is assumed to be served by a local Ollama instance
        supplement = chat(
            model=model,
            messages=messages,
            options={'temperature': temperature})
        termlist = supplement.message.content
    return termlist

# Copyright Sep 2025 Glen Rogers.
# Subject to MIT license.
147 changes: 147 additions & 0 deletions MicroTraitLLM_VQA/figure_ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import os
import uuid
import json
import base64
from typing import Dict, Any

from openai import OpenAI
from read_api_keys import load_api_keys

# Storage locations for ingested figures and their metadata index
DATA_DIR = "data"
FIGURE_DIR = os.path.join(DATA_DIR, "figures")
# JSONL file: one record per ingested figure (see ingest_figure)
FIGURE_INDEX_PATH = os.path.join(DATA_DIR, "figure_index.jsonl")

# Make sure directories exist (import-time side effect)
os.makedirs(FIGURE_DIR, exist_ok=True)

# OpenAI client for vision + embeddings.
# NOTE(review): apikeys.txt is read at import time; if the API_KEY_OPENAI entry
# is missing the client is still constructed (with api_key=None) and the
# helper functions below raise RuntimeError when first called.
_api_keys = load_api_keys("apikeys.txt")
_api_key_openai = _api_keys.get("API_KEY_OPENAI")
client = OpenAI(api_key=_api_key_openai)


def _extract_metadata_from_image(image_bytes: bytes) -> Dict[str, Any]:
    """
    Ask a multimodal model to describe a scientific figure.

    Produces a dict with:
    - caption: concise description of the figure
    - ocr_text: any text / labels the model can read
    - chart_data: table-like structure of key numbers, if present
    """
    if not _api_key_openai:
        raise RuntimeError("API_KEY_OPENAI missing in apikeys.txt")

    b64 = base64.b64encode(image_bytes).decode("utf-8")
    figure_url = f"data:image/png;base64,{b64}"

    system_prompt = (
        "You are a vision model analyzing scientific figures. "
        "Return a JSON object with fields: "
        "`caption` (string), "
        "`ocr_text` (string of all readable text), and "
        "`chart_data` (an array of rows; each row an object with keys like "
        "axis_labels, series, values, units, etc.). "
        "If something is missing, use an empty string or empty array."
    )
    user_content = [
        {
            "type": "text",
            "text": "Analyze this figure and extract caption, OCR text, and structured chart/table data as JSON.",
        },
        {"type": "image_url", "image_url": {"url": figure_url}},
    ]

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        temperature=0.0,
    )

    raw = completion.choices[0].message.content
    try:
        result = json.loads(raw)
    except json.JSONDecodeError:
        # Model did not return valid JSON; keep the raw text as the caption.
        result = {"caption": raw, "ocr_text": "", "chart_data": []}

    # Guarantee all three keys exist for downstream consumers.
    for key, default in (("caption", ""), ("ocr_text", ""), ("chart_data", [])):
        result.setdefault(key, default)

    return result


def _embed_text(text: str) -> list:
    """
    Return an embedding vector for *text* (caption + OCR text).

    Used for the text index (and as a stand-in for an image embedding for now).
    """
    if not _api_key_openai:
        raise RuntimeError("API_KEY_OPENAI missing in apikeys.txt")

    response = client.embeddings.create(model="text-embedding-3-small", input=[text])
    return response.data[0].embedding


def ingest_figure(image_bytes: bytes, paper_id: str | None = None) -> Dict[str, Any]:
    """
    Main ingestion entry point for a single figure image.

    Pipeline:
    1) Run vision model to get metadata (caption, ocr_text, chart_data).
    2) Compute embeddings over caption + OCR text.
    3) Save original image bytes to disk as PNG.
    4) Append a record to figure_index.jsonl.

    Returns the stored record, including a generated figure_id.
    """
    # Step 1: metadata from vision model
    metadata = _extract_metadata_from_image(image_bytes)

    # Step 2: embeddings (empty when there is no text to embed)
    combined_text = (metadata.get("caption", "") + "\n" +
                     metadata.get("ocr_text", "")).strip()
    text_embedding = _embed_text(combined_text) if combined_text else []
    # No dedicated image encoder yet, so the text embedding stands in for it.
    image_embedding = text_embedding

    # Step 3: save original image under a fresh UUID
    figure_id = str(uuid.uuid4())
    image_path = os.path.join(FIGURE_DIR, f"{figure_id}.png")
    with open(image_path, "wb") as out:
        out.write(image_bytes)

    # Step 4: build record
    record: Dict[str, Any] = {
        "figure_id": figure_id,
        "paper_id": paper_id,
        "caption": metadata.get("caption", ""),
        "ocr_text": metadata.get("ocr_text", ""),
        "chart_data": metadata.get("chart_data", []),
        "text_embedding": text_embedding,
        "image_embedding": image_embedding,
        "image_path": image_path,
    }

    # Append to the JSONL index
    with open(FIGURE_INDEX_PATH, "a", encoding="utf-8") as out:
        out.write(json.dumps(record) + "\n")

    return record
Loading