diff --git a/_includes/themes/lab/paper.html b/_includes/themes/lab/paper.html index 7acb310ce..fb8f87b36 100755 --- a/_includes/themes/lab/paper.html +++ b/_includes/themes/lab/paper.html @@ -160,7 +160,7 @@
{% endif %} diff --git a/assets/images/news/Certificate.jpg b/assets/images/news/Certificate.jpg new file mode 100644 index 000000000..ba87ed47d Binary files /dev/null and b/assets/images/news/Certificate.jpg differ diff --git a/assets/images/news/Certificate.pdf b/assets/images/news/Certificate.pdf new file mode 100644 index 000000000..3990c3b72 Binary files /dev/null and b/assets/images/news/Certificate.pdf differ diff --git a/assets/images/papers/acm.jpg b/assets/images/papers/acm.jpg new file mode 100644 index 000000000..e3c796c00 Binary files /dev/null and b/assets/images/papers/acm.jpg differ diff --git a/assets/images/papers/brain-informatics.png b/assets/images/papers/brain-informatics.png new file mode 100644 index 000000000..7b969bb69 Binary files /dev/null and b/assets/images/papers/brain-informatics.png differ diff --git a/assets/images/papers/methodsx.jpg b/assets/images/papers/methodsx.jpg new file mode 100644 index 000000000..017b68d72 Binary files /dev/null and b/assets/images/papers/methodsx.jpg differ diff --git a/assets/images/papers/neuroinformatics.jpeg b/assets/images/papers/neuroinformatics.jpeg new file mode 100644 index 000000000..fad34b2c9 Binary files /dev/null and b/assets/images/papers/neuroinformatics.jpeg differ diff --git a/assets/images/papers/plos.jpg b/assets/images/papers/plos.jpg new file mode 100644 index 000000000..b6a69c876 Binary files /dev/null and b/assets/images/papers/plos.jpg differ diff --git a/assets/images/projects/2026-02-18-molecules-proteins.png b/assets/images/projects/2026-02-18-molecules-proteins.png new file mode 100644 index 000000000..744e01fb6 Binary files /dev/null and b/assets/images/projects/2026-02-18-molecules-proteins.png differ diff --git a/assets/images/team/Brett_picture.jpg b/assets/images/team/Brett_picture.jpg new file mode 100644 index 000000000..162657e60 Binary files /dev/null and b/assets/images/team/Brett_picture.jpg differ diff --git a/assets/images/team/Group_2019.jpg 
b/assets/images/team/Group_2019.jpg new file mode 100644 index 000000000..a6a2c437d Binary files /dev/null and b/assets/images/team/Group_2019.jpg differ diff --git a/assets/images/team/Mohammed_Khubaib.png b/assets/images/team/Mohammed_Khubaib.png new file mode 100644 index 000000000..ad467b6a2 Binary files /dev/null and b/assets/images/team/Mohammed_Khubaib.png differ diff --git a/assets/images/team/RobertLoredo_BwHiRes.png b/assets/images/team/RobertLoredo_BwHiRes.png new file mode 100644 index 000000000..f000da512 Binary files /dev/null and b/assets/images/team/RobertLoredo_BwHiRes.png differ diff --git a/assets/images/team/avash_pic.jpg b/assets/images/team/avash_pic.jpg new file mode 100644 index 000000000..7cbfe39bd Binary files /dev/null and b/assets/images/team/avash_pic.jpg differ diff --git a/assets/images/team/gabriel_oliveira.jpg b/assets/images/team/gabriel_oliveira.jpg new file mode 100644 index 000000000..e2d07ee88 Binary files /dev/null and b/assets/images/team/gabriel_oliveira.jpg differ diff --git a/assets/images/team/group_2015.jpg b/assets/images/team/group_2015.jpg new file mode 100644 index 000000000..b92eb1c0b Binary files /dev/null and b/assets/images/team/group_2015.jpg differ diff --git a/assets/images/team/group_2016.jpg b/assets/images/team/group_2016.jpg new file mode 100644 index 000000000..ddda4a38b Binary files /dev/null and b/assets/images/team/group_2016.jpg differ diff --git a/assets/images/team/group_2017.jpg b/assets/images/team/group_2017.jpg new file mode 100644 index 000000000..5b1681bba Binary files /dev/null and b/assets/images/team/group_2017.jpg differ diff --git a/assets/images/team/group_2018.jpg b/assets/images/team/group_2018.jpg new file mode 100644 index 000000000..efc6d34d7 Binary files /dev/null and b/assets/images/team/group_2018.jpg differ diff --git a/assets/pdfs/Avash_Palikhe_CV.pdf b/assets/pdfs/Avash_Palikhe_CV.pdf new file mode 100644 index 000000000..ffe27fe91 Binary files /dev/null and 
b/assets/pdfs/Avash_Palikhe_CV.pdf differ diff --git a/assets/pdfs/BBrett_CSR.pdf b/assets/pdfs/BBrett_CSR.pdf new file mode 100644 index 000000000..502538a2c Binary files /dev/null and b/assets/pdfs/BBrett_CSR.pdf differ diff --git a/assets/pdfs/CV_Gabriel.pdf b/assets/pdfs/CV_Gabriel.pdf new file mode 100644 index 000000000..4191676fe Binary files /dev/null and b/assets/pdfs/CV_Gabriel.pdf differ diff --git a/assets/pdfs/CV_Robert_Loredo.pdf b/assets/pdfs/CV_Robert_Loredo.pdf new file mode 100644 index 000000000..445d13050 Binary files /dev/null and b/assets/pdfs/CV_Robert_Loredo.pdf differ diff --git a/assets/pdfs/Mohammed_Khubaib_CV.pdf b/assets/pdfs/Mohammed_Khubaib_CV.pdf new file mode 100644 index 000000000..cdd43809b Binary files /dev/null and b/assets/pdfs/Mohammed_Khubaib_CV.pdf differ diff --git a/assets/themes/lab/images/banner/banner-2.png b/assets/themes/lab/images/banner/banner-2.png new file mode 100644 index 000000000..220ad1409 Binary files /dev/null and b/assets/themes/lab/images/banner/banner-2.png differ diff --git a/assets/themes/lab/images/banner/banner-3.png b/assets/themes/lab/images/banner/banner-3.png new file mode 100644 index 000000000..5c546c667 Binary files /dev/null and b/assets/themes/lab/images/banner/banner-3.png differ diff --git a/assets/themes/lab/images/banner/banner-4.png b/assets/themes/lab/images/banner/banner-4.png new file mode 100644 index 000000000..4e7bca1b9 Binary files /dev/null and b/assets/themes/lab/images/banner/banner-4.png differ diff --git a/news/_posts/2026-02-11-Saeed-Seminar.md b/news/_posts/2026-02-11-Saeed-Seminar.md new file mode 100644 index 000000000..3b22efbe4 --- /dev/null +++ b/news/_posts/2026-02-11-Saeed-Seminar.md @@ -0,0 +1,15 @@ +--- +layout: news +title: Invited Distinguished Research Seminar, Ulster University, Ireland UK. +tags: [talks] +image: +excerpt: Prof. Saeed delivers a distinguished invited talk at Ulster University, UK. +--- + +Prof. 
Saeed delivered a distinguished invited talk entitled “Capturing Proteomic and Neuro Architecture Complexity using Advance Machine Learning Modelling” at the School of Computing, Faculty of Computing, Engineering and the Built Environment, Ulster University at Belfast Campus, Northern Ireland, UK. + +In this talk, Dr. Saeed discussed different challenges and opportunities specific to AI models for biomedical and health data. + + + + diff --git a/papers/_posts/2021-01-01-specollate--deep-cross-modal-similarity-network-for-mass-spectrometry-data-based-peptide-deductions.md b/papers/_posts/2021-01-01-specollate--deep-cross-modal-similarity-network-for-mass-spectrometry-data-based-peptide-deductions.md index 6004ffd6a..179a19bde 100644 --- a/papers/_posts/2021-01-01-specollate--deep-cross-modal-similarity-network-for-mass-spectrometry-data-based-peptide-deductions.md +++ b/papers/_posts/2021-01-01-specollate--deep-cross-modal-similarity-network-for-mass-spectrometry-data-based-peptide-deductions.md @@ -9,7 +9,7 @@ volume: 16 issue: pages: e0259349 is_published: True -image: /assets/images/papers/plos.png +image: /assets/images/papers/plos.jpg projects: [ML-MS] tags: [] diff --git a/papers/_posts/2025-02-02-survey.md b/papers/_posts/2025-02-02-survey.md index 9808c618c..eb5871253 100644 --- a/papers/_posts/2025-02-02-survey.md +++ b/papers/_posts/2025-02-02-survey.md @@ -9,7 +9,7 @@ volume: issue: Vol. 
12, article 8 pages: 1-27 is_published: True -image: /assets/images/papers/springer.png +image: /assets/images/papers/brain-informatics.png projects: [ML-ADRD] tags: [journal] diff --git a/papers/_posts/2024-07-22-MLSPred-Bench.md b/papers/_posts/2025-07-22-MLSPred-Bench.md similarity index 77% rename from papers/_posts/2024-07-22-MLSPred-Bench.md rename to papers/_posts/2025-07-22-MLSPred-Bench.md index 2cd4b62ba..b37cece44 100644 --- a/papers/_posts/2024-07-22-MLSPred-Bench.md +++ b/papers/_posts/2025-07-22-MLSPred-Bench.md @@ -1,24 +1,24 @@ --- layout: paper -title: "MLSPred-Bench: ML-Ready Benchmark Leveraging Seizure Detection EEG data for Predictive Models" +title: "MLSPred-Bench: Transforming Electroencephalography (EEG) Datasets into Machine Learning-Ready Seizure Prediction Benchmarks" nickname: MLSPred-Bench-paper authors: "Mohammad, Umair; Saeed, Fahad; " -year: "2024" -journal: "bioRxiv" +year: "2025" +journal: "Elsevier MethodsX" volume: issue: pages: 1-8 -is_published: False -image: /assets/images/papers/biorxiv.png -projects: [ML-seizure] -tags: [preprint] +is_published: True +image: /assets/images/papers/methodsx.jpg +projects: [ML-seizure, MLSPred-Bench] +tags: [ML, EEG, Epilepsy] # Text -fulltext: https://www.biorxiv.org/content/10.1101/2024.07.17.604006v1 +fulltext: https://doi.org/10.1016/j.mex.2025.103574 pdf: -pdflink: https://www.biorxiv.org/content/10.1101/2024.07.17.604006v1.full.pdf +pdflink: pmcid: -preprint: +preprint: https://www.biorxiv.org/content/10.1101/2024.07.17.604006v1.full.pdf supplement: # Links @@ -26,12 +26,12 @@ doi: "10.1101/2024.07.17.604006" pmid: # Data and code -github: [] +github: [https://github.com/pcdslab/MLSPred-Bench] neurovault: openneuro: [] figshare: figshare_names: -osf: +osf: --- {% include JB/setup %} diff --git a/papers/_posts/2025-08-18-Overcoming-Site-Variability.md b/papers/_posts/2025-08-18-Overcoming-Site-Variability.md new file mode 100644 index 000000000..20594dd19 --- /dev/null +++ 
b/papers/_posts/2025-08-18-Overcoming-Site-Variability.md @@ -0,0 +1,40 @@ +--- +layout: paper +title: "Overcoming Site Variability in Multisite fMRI Studies: An Autoencoder Framework for Enhanced Generalizability of Machine Learning Models" +nickname: AE-Harmonization-paper +authors: "Almuqhim, Fahad; Saeed, Fahad; " +year: "2025" +journal: "Springer Neuroinformatics" +volume: 23 +issue: article 46 +pages: +is_published: True +image: /assets/images/papers/neuroinformatics.jpeg +projects: [ML-brain-imaging] +tags: [ML, AE, ComBat, ASD, Multisite] + +# Text +fulltext: https://link.springer.com/article/10.1007/s12021-025-09746-1 +pdf: +pdflink: https://link.springer.com/content/pdf/10.1007/s12021-025-09746-1.pdf +pmcid: +preprint: +supplement: + +# Links +doi: 10.1007/s12021-025-09746-1 +pmid: + +# Data and code +github: [https://github.com/pcdslab/Autoencoder-fMRI-Harmonization] +neurovault: +openneuro: [] +figshare: +figshare_names: +osf: [d8253] +--- +{% include JB/setup %} + +# Abstract + +Harmonizing multisite functional magnetic resonance imaging (fMRI) data is crucial for eliminating site-specific variability that hinders the generalizability of machine learning models. Traditional harmonization techniques, such as ComBat, depend on additive and multiplicative factors, and may struggle to capture the non-linear interactions between scanner hardware, acquisition protocols, and signal variations between different imaging sites. In addition, these statistical techniques require data from all the sites during their model training which may have the unintended consequence of data leakage for ML models trained using this harmonized data. The ML models trained using this harmonized data may result in low reliability and reproducibility when tested on unseen data sets, limiting their applicability for general clinical usage. In this study, we propose Autoencoders (AEs) as an alternative for harmonizing multisite fMRI data. 
Our designed and developed framework leverages the non-linear representation learning capabilities of AEs to reduce site-specific effects while preserving biologically meaningful features. Our evaluation using Autism Brain Imaging Data Exchange I (ABIDE-I) dataset, containing 1,035 subjects collected from 17 centers demonstrates statistically significant improvements in leave-one-site-out (LOSO) cross-validation evaluations. All AE variants (AE, SAE, TAE, and DAE) significantly outperformed the baseline model (p<0.01), with mean accuracy improvements ranging from 3.41% to 5.04%. Our findings demonstrate the potential of AEs to harmonize multisite neuroimaging data effectively enabling robust downstream analyses across various neuroscience applications while reducing data-leakage, and preservation of neurobiological features. Our open-source code is made available at https://github.com/pcdslab/Autoencoder-fMRI-Harmonization \ No newline at end of file diff --git a/papers/_posts/2025-12-1-Raptor.md b/papers/_posts/2025-12-1-Raptor.md new file mode 100644 index 000000000..0e843a268 --- /dev/null +++ b/papers/_posts/2025-12-1-Raptor.md @@ -0,0 +1,40 @@ +--- +layout: paper +title: "RAPTOR: Reconfigurable Advanced Platform for Transdisciplinary Open Research" +nickname: Raptor-colab-paper +authors: "Hamed Najafi; Pratik Poudel; Kiavash Bahreini; Julio Ibarra; Fahad Saeed; Yuepeng Li; Jayantha Obeysekera; Jason Liu;" +year: "2025" +journal: "Proceedings of 15th Workshop on AI and Scientific Computing at Scale using Flexible Computing Infrastructures (FlexScience)" +volume: +issue: Article 50 +pages: 1 - 5 +is_published: True +image: /assets/images/papers/acm.jpg +projects: [HPC-MS] +tags: [ML, AE, ComBat, ASD, Multisite] + +# Text +fulltext: https://dl.acm.org/doi/pdf/10.1145/3731545.3744665 +pdf: +pdflink: +pmcid: +preprint: +supplement: + +# Links +doi: 10.1145/3731545.3744665 +pmid: + +# Data and code +github: [] +neurovault: +openneuro: [] +figshare: +figshare_names: 
+osf: [] +--- +{% include JB/setup %} + +# Abstract + +Scientific research is increasingly relying on complex workflows that span multiple computing paradigms, including high-performance computing (HPC), high-throughput computing (HTC), and machine learning/artificial intelligence (ML/AI). Traditional monolithic computing infrastructures often struggle to accommodate these diverse and evolving demands. The Reconfigurable Advanced Platform for Transdisciplinary Open Research (RAPTOR) addresses this challenge by providing a dynamically reconfigurable computing environment that integrates with federated resources. RAPTOR's architecture enables dynamic provisioning between an HPC cluster and the Chameleon Cloud platform based on workload requirements, supporting bare-metal customization for specialized applications. This paper focuses on RAPTOR's reconfigurability features and demonstrates their effectiveness through quantitative performance evaluations across four scientific domains: computational proteomics, climate modeling, weather research, and hurricane risk assessment. Our results demonstrate that RAPTOR's reconfigurable design significantly enhances research productivity by providing an appropriate computing environment for diverse computational needs. 
diff --git a/papers/_posts/2025-12-1-fairGNN-WOD.md b/papers/_posts/2025-12-1-fairGNN-WOD.md new file mode 100644 index 000000000..b2ec52b64 --- /dev/null +++ b/papers/_posts/2025-12-1-fairGNN-WOD.md @@ -0,0 +1,40 @@ +--- +layout: paper +title: "fairGNN-WOD: fair graph learning without demographics" +nickname: fairGNN-colab-paper +authors: "Zichong Wang; Fang Liu; Shimei Pan; Jun Liu; Fahad Saeed; Meikang Qiu; Wenbin Zhang;" +year: "2025" +journal: "IJCAI '25: Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence" +volume: +issue: Article 63 +pages: 556 - 564 +is_published: True +image: /assets/images/papers/acm.jpg +projects: [ML-brain-imaging] +tags: [ML, ASD, ADRD, Multisite] + +# Text +fulltext: +pdf: +pdflink: https://www.ijcai.org/proceedings/2025/0063.pdf +pmcid: +preprint: +supplement: + +# Links +doi: 10.24963/ijcai.2025/63 +pmid: + +# Data and code +github: [] +neurovault: +openneuro: [] +figshare: +figshare_names: +osf: [] +--- +{% include JB/setup %} + +# Abstract + +Graph Neural Networks (GNNs) have excelled in diverse applications due to their outstanding predictive performance, yet they often overlook fairness considerations, prompting numerous recent efforts to address this societal concern. However, most fair GNNs assume complete demographics by design, which is impractical in most real-world socially sensitive applications due to privacy, legal, or regulatory restrictions. For example, the Consumer Financial Protection Bureau (CFPB) mandates that creditors ensure fairness without requesting or collecting information about an applicant's race, religion, nationality, sex, or other demographics. To this end, this paper proposes fairGNN-WOD, a first-of-its-kind framework that considers mitigating unfairness in graph learning without using demographic information. 
In addition, this paper provides a theoretical perspective on analyzing bias in node representations and establishes the relationship between utility and fairness objectives. Experiments on three real-world graph datasets illustrate that fairGNN-WOD outperforms state-of-the-art baselines in achieving fairness but also maintains comparable prediction performance. \ No newline at end of file diff --git a/papers/_posts/2026-01-01-systems-and-methods-for-pred-seizures.md b/papers/_posts/2026-01-01-systems-and-methods-for-pred-seizures.md new file mode 100644 index 000000000..7dd2a72b3 --- /dev/null +++ b/papers/_posts/2026-01-01-systems-and-methods-for-pred-seizures.md @@ -0,0 +1,40 @@ +--- +layout: paper +title: "Systems and methods for patient-specific epileptic seizure prediction" +nickname: seizure-patent +authors: "Mohammad, Umair; Saeed, Fahad;" +year: "2026" +journal: "US Patent US-12544002-B2" +volume: +issue: +pages: +is_published: True +image: /assets/images/papers/uspto.png +projects: [ML-seizure] +tags: [patent] + +# Text +fulltext: https://ppubs.uspto.gov/api/patents/html/12544002?source=USPAT&requestToken=eyJzdWIiOiI1ODY2Y2I2YS02ZDc5LTRmMTAtOWI2Ni05YTk3ZjgzNjA5MGEiLCJ2ZXIiOiI0ODcwYTRlMi00YThhLTQ4NzQtOTIyOC01NTE0NjRmNGY0ZmYiLCJleHAiOjB9 +pdf: https://ppubs.uspto.gov/api/pdf/downloadPdf/12544002?requestToken=eyJzdWIiOiI1ODY2Y2I2YS02ZDc5LTRmMTAtOWI2Ni05YTk3ZjgzNjA5MGEiLCJ2ZXIiOiI0ODcwYTRlMi00YThhLTQ4NzQtOTIyOC01NTE0NjRmNGY0ZmYiLCJleHAiOjB9 +pdflink: +pmcid: +preprint: +supplement: + +# Links +doi: "" +pmid: + +# Data and code +github: [] +neurovault: +openneuro: [] +figshare: +figshare_names: +osf: +--- +{% include JB/setup %} + +# Abstract + +A patient specific epileptic seizure(ES) prediction model using only electroencephalography (EEG) data with residual neural networks (ResNets) and transfer learning (TL) techniques (i.e., SPERTL) is provided. 
One exemplary provided model was trained on EEG data from 23 patients with a seizure prediction horizon (SPH) of 5 minutes and used the validation data to plot precision-recall curves to aid in selecting preferred thresholds. Testing on unseen data shows the provided model outperforms related art methods by achieving the highest average sensitivity of 88.1%, specificity of 92.3%, and accuracy of 92.3%. Results also demonstrate the proposed model is less susceptible to false positives while maintaining a high positive prediction rate. diff --git a/papers/_posts/2026-02-18-FiCOPS.md b/papers/_posts/2026-02-18-FiCOPS.md new file mode 100644 index 000000000..d5791642d --- /dev/null +++ b/papers/_posts/2026-02-18-FiCOPS.md @@ -0,0 +1,40 @@ +--- +layout: paper +title: "FiCOPS: Hardware and Software Co-Design of FPGA Computational Framework for Mass Spectrometry-Based Peptide Database Search" +nickname: FiCOPS-paper +authors: "Kumar, Sumesh; Zambreno, Joseph; Khokhar, Ashfaq; Akram, Shoaib; Saeed, Fahad;" +year: "2026" +journal: +volume: +issue: +pages: +is_published: False +image: /assets/images/papers/biorxiv.png +projects: [HPC-MS] +tags: [preprint] + +# Text +fulltext: https://www.biorxiv.org/content/10.64898/2026.02.15.706012v1 +pdf: +pdflink: https://www.biorxiv.org/content/10.64898/2026.02.15.706012v1.full.pdf +pmcid: +preprint: +supplement: + +# Links +doi: https://doi.org/10.64898/2026.02.15.706012 +pmid: + +# Data and code +github: [] +neurovault: +openneuro: [] +figshare: +figshare_names: +osf: [] +--- +{% include JB/setup %} + +# Abstract + +Improving the speed and efficiency of database search algorithms that deduce peptides from mass spectrometry (MS) data has been an active area of research for more than three decades. The significance of the need for faster database search methods has rapidly increased due to the growing interest in studying non-model organisms, meta-proteomics, and proteogenomic data, which are notorious for their enormous search space. 
Poor scalability of serial algorithms with the growing size of the database and increasing parameters of post-translational modifications is a widely recognized problem. While high-performance computing techniques can be used on supercomputing machines, the need for real-time, on-the-instrument solutions necessitates the development of an efficient system-on-chip that optimizes design constraints such as cost, performance, and power of the system. To showcase that such a system can work, we present an FPGA-based computational framework called FiCOPS to accelerate database search using a hardware/software co-design methodology. First, we theoretically analyze the database-search algorithm (closed-search) to reveal opportunities for parallelism and uncover computational bottlenecks. We then design an FPGA-based architectural template to exploit parallelism inherent in the search workload. We also formulate an analytical performance model for the architecture template to perform rapid design space exploration and find a near-optimal accelerator configuration. Finally, we implement our design on the Intel Stratix 10 FPGA platform and evaluate it using real-world datasets. Our experiments demonstrate that FiCOPS achieves 3.5 times speed-up over existing CPU solutions and 3 times and 5 times reduction in power consumption compared to existing CPU and GPU solutions. 
diff --git a/papers/_posts/2026-02-18-MolDeBERTa.md b/papers/_posts/2026-02-18-MolDeBERTa.md new file mode 100644 index 000000000..0afef53cd --- /dev/null +++ b/papers/_posts/2026-02-18-MolDeBERTa.md @@ -0,0 +1,40 @@ +--- +layout: paper +title: "MolDeBERTa: Foundational Model for Physicochemical and Structural-Informed Molecular Representation Learning" +nickname: MolDeBERTa-paper +authors: "Oliveira, Gabriel Bianchin; Saeed, Fahad;" +year: "2026" +journal: +volume: +issue: +pages: +is_published: False +image: /assets/images/papers/biorxiv.png +projects: [ML-molecular-protein-representation] +tags: [preprint] + +# Text +fulltext: https://www.biorxiv.org/content/10.64898/2026.02.15.706011v1 +pdf: +pdflink: https://www.biorxiv.org/content/10.64898/2026.02.15.706011v1.full.pdf +pmcid: +preprint: +supplement: + +# Links +doi: https://doi.org/10.64898/2026.02.15.706011 +pmid: + +# Data and code +github: [https://github.com/pcdslab/MolDeBERTa] +neurovault: +openneuro: [] +figshare: +figshare_names: +osf: [] +--- +{% include JB/setup %} + +# Abstract + +Foundational models that learn the language of molecules are essential for accelerating material and drug discovery. These self-learning models can be trained on a large number of unlabelled molecules, enabling applications like property prediction, molecule design (de novo generation, optimization), and screening for specific functions. However, existing molecular language models are built upon first-generation transformer architectures and are pretrained using masked language modeling, a generic token-level objective that is agnostic to physicochemical and structural molecular properties. Here we introduce MolDeBERTa, a structure-informed self-supervised molecular encoder that leverages a byte-level Byte-Pair Encoding (BPE) tokenization strategy. MolDeBERTa is pretrained on up to 123 million SMILES molecules from PubChem, representing one of the largest publicly available SMILES-based corpora. 
To achieve this, we introduce three novel pretraining objectives designed to inject strong inductive biases for molecular properties and structural similarity directly into the latent space, resulting in reduced gap between linguistic chemical representations and physical molecular properties. The model was then systematically investigated across three architectural scales, two dataset sizes, and five distinct pretraining objectives. MolDeBERTa when evaluated on 9 downstream MoleculeNet benchmarks outperformed existing masked language models, achieving up to a 16% reduction in regression error and improvements of up to 3.0 ROC-AUC points on classification benchmarks. MolDeBERTa advances unsupervised encoder-based foundational models at scale both for pretraining data and downstream evaluation, enabling data-efficient chemistry-informed representation learning. The source code is publicly available at https://github.com/pcdslab/MolDeBERTa, and Hugging Face at https://huggingface.co/collections/SaeedLab/moldeberta. All the pretraining datasets are available at https://huggingface.co/datasets/SaeedLab/MolDeBERTa. 
\ No newline at end of file diff --git a/papers/_posts/2026-02-18-TITAN-BBB.md b/papers/_posts/2026-02-18-TITAN-BBB.md new file mode 100644 index 000000000..6b132b3ee --- /dev/null +++ b/papers/_posts/2026-02-18-TITAN-BBB.md @@ -0,0 +1,40 @@ +--- +layout: paper +title: "TITAN-BBB: Predicting BBB Permeability using Multi-Modal Deep-Learning Models" +nickname: TITAN-BBB-paper +authors: "Oliveira, Gabriel Bianchin; Saeed, Fahad;" +year: "2026" +journal: +volume: +issue: +pages: +is_published: False +image: /assets/images/papers/biorxiv.png +projects: [ML-molecular-protein-representation] +tags: [preprint] + +# Text +fulltext: https://www.biorxiv.org/content/10.64898/2026.02.15.706007v1 +pdf: +pdflink: https://www.biorxiv.org/content/10.64898/2026.02.15.706007v1.full.pdf +pmcid: +preprint: +supplement: + +# Links +doi: https://doi.org/10.64898/2026.02.15.706007 +pmid: + +# Data and code +github: [https://github.com/pcdslab/TITAN-BBB] +neurovault: +openneuro: [] +figshare: +figshare_names: +osf: [] +--- +{% include JB/setup %} + +# Abstract + +Computational prediction of blood-brain barrier (BBB) permeability has emerged as a vital alternative to traditional experimental assays, which are often resource-intensive and too low-throughput to meet the demands of early-stage drug discovery. While early machine learning approaches have shown promise, integration of traditional chemical descriptors with deep learning embeddings remains an underexplored frontier. In this paper, we introduce TITAN-BBB, a multi-modal deep-learning architecture that utilizes tabular, image, and text-based features and combines them using attention mechanisms. To evaluate, we aggregated multiple literature sources to create the largest BBB permeability dataset to date, enabling robust training for both classification and regression tasks. 
Our results demonstrate that TITAN-BBB achieves 86.5% of balanced accuracy on classification tasks and 0.436 of mean absolute error for regression, outperforming the state-of-the-art by 3.1 percentage points in balanced accuracy and reducing the regression error by 20%. Our approach also outperforms state-of-the-art models in both classification and regression performance, demonstrating the benefits of combining deep and domain-specific representations. The source code is publicly available at https://github.com/pcdslab/TITAN-BBB. The inference-ready model is hosted on Hugging Face at https://huggingface.co/SaeedLab/TITAN-BBB, and the aggregated BBB permeability datasets are available at https://huggingface.co/datasets/SaeedLab/BBBP. \ No newline at end of file diff --git a/papers/_posts/2026-02-23-dom-formula-assignment.md b/papers/_posts/2026-02-23-dom-formula-assignment.md new file mode 100644 index 000000000..54d48fe22 --- /dev/null +++ b/papers/_posts/2026-02-23-dom-formula-assignment.md @@ -0,0 +1,40 @@ +--- +layout: paper +title: "A Machine Learning and Benchmarking Approach for Molecular Formula Assignment of Ultra High-Resolution Mass Spectrometry Data from Complex Mixtures" +nickname: dom-formula-assignment-paper +authors: "Shabbir, Bilal; Oliveira, Pablo R B; Fernandez-Lima, Francisco; Saeed, Fahad;" +year: "2026" +journal: +volume: +issue: +pages: +is_published: False +image: /assets/images/papers/biorxiv.png +projects: [ML-MS] +tags: [preprint] + +# Text +fulltext: https://www.biorxiv.org/content/10.64898/2026.02.17.706479v1 +pdf: +pdflink: https://www.biorxiv.org/content/10.64898/2026.02.17.706479v1.full.pdf +pmcid: +preprint: +supplement: + +# Links +doi: https://doi.org/10.64898/2026.02.17.706479 +pmid: + +# Data and code +github: [https://github.com/pcdslab/dom-formula-assignment-using-ml] +neurovault: +openneuro: [] +figshare: +figshare_names: +osf: [] +--- +{% include JB/setup %} + +# Abstract + +A machine learning approach to molecular formula 
assignment is crucial for unlocking the full potential of ultra-high resolution mass spectrometry (UHRMS) when analyzing complex mixtures. By combining data-driven models with rigorous benchmarking, the accuracy, consistency, and speed in identifying plausible molecular formulas from vast spectral datasets can be improved. Compared with traditional de novo methods that rely heavily on rule-based heuristics, and manual parameter tuning, machine learning approaches can capture complex patterns in data and adapt more readily to diverse sample types. In this paper, we describe the application of a machine learning method using the k-nearest neighbors (KNN) algorithm trained on curated chemical formula datasets of UHRMS analysis of dissolved organic matter (DOM) covering the saline river continuum and tropical wet/dry season variability. The influence of the mass accuracy (training set with 0.15-1ppm) was evaluated on a blind test set of DOMs of different geographical origins. A Decision Tree Regressor (DTR) and Random Forest Regressor (RFR) based on mass accuracy (<1ppm) were used. Results from our ML models exhibit 43% more formulas annotated than traditional methods (5796 vs 4047); Model-Synthetic achieved 99.9% assignment rate and annotated/assigned 2x more formulas (8,268 vs 4047). DTR and RFR achieved formula-level accuracies (FA) of 86.5% and 60.4%, respectively. Overall, results show an increase in formula assignment when compared with traditional methods. This ultimately enables more reliable characterization of complex natural and engineered systems, supporting advances in fields such as environmental science, metabolomics, and petroleomics. Furthermore, the novel data set produced for this study is made publicly available, establishing an initial benchmark for molecular formula assignment in UHRMS using machine learning. The dataset and code are publicly available at: https://github.com/pcdslab/dom-formula-assignment-using-ml. 
\ No newline at end of file diff --git a/projects/_posts/2024-06-22-MLSPred-Bench.md b/projects/_posts/2024-06-22-MLSPred-Bench.md index d79173084..a6177443f 100644 --- a/projects/_posts/2024-06-22-MLSPred-Bench.md +++ b/projects/_posts/2024-06-22-MLSPred-Bench.md @@ -28,6 +28,19 @@ MLSPred-Bench will create 12 different benchmarks based on different values of t For each benchmark, MLSPred-Bench draws preictal segments of length from the SPH duration. We assume there is a gap equal to the SOP in minutes before the start of a seizure where the SPH ends. The datasets are class-balanced where an equal amount of interictal samples are drawn from sessions of the same subject where there were no seizures. +We designed and developed a method called MLSPred-Bench that can be used for converting any EEG big data annotated for detection into ML-ready data suitable for prediction. We apply our methods to the existing EEG data corpus to generate 12 ML-ready benchmarks comprising data for training, validating, and testing seizure prediction models. Our strategy uses different variations of seizure prediction horizon (SPH) and the seizure occurrence period (SOP) to produce more than 150GB of ML-ready data. We hope that the generated benchmarking data will be utilized by various computational groups for their seizure prediction model development. + +Send an email to fsaeed@fiu.edu if you want to get pre-processed MLSPred-Bench data. + +The work can be summarized as follows: +1. Extract short preictal and interictal segments from long-duration annotated EEG montages. +2. Generate a comprehensive list of ML-ready benchmarks with varying SPH and SOP. +3. Technically validate the generated data with multiple ML and DL models with up to 88.73% validation accuracy. +4. Open-source code and related materials are available at https://github.com/pcdslab/MLSPred-Bench. + + + + ## Participate Data collection and curation for this study is complete. 
diff --git a/projects/_posts/2026-02-18-ML-molecules-proteins.md b/projects/_posts/2026-02-18-ML-molecules-proteins.md new file mode 100644 index 000000000..51291e55c --- /dev/null +++ b/projects/_posts/2026-02-18-ML-molecules-proteins.md @@ -0,0 +1,31 @@ +--- +layout: project +title: "Molecular and Protein Representation Learning" +contributors: [Prof-S,gabrielbianchin] +handle: ML-molecular-protein-representation +status: analysis +type: Method Development + +# Optional +website: +grant: [{id: NIH}] +image: /assets/images/projects/2026-02-18-molecules-proteins.png +tagline: 'Machine learning and foundation models for molecular and protein representation learning and biomedical applications' +tags: [molecules,proteins] + +# Data and code +github: [] +neurovault: +openneuro: +figshare: +figshare_names: +osf: +--- +{% include JB/setup %} + +This project focuses on developing advanced machine learning and foundational models for molecular and protein representation learning. The research aims to integrate physicochemical properties, structural information, and dynamic biological signals into scalable deep learning architectures. By leveraging large-scale pretraining, self-supervised learning, and multimodal modeling, the project seeks to bridge the gap between data-driven representations and real-world biological functionality. The resulting models enable a wide range of applications, including molecular property prediction, drug discovery, protein function annotation, and biomedical data analysis. Ultimately, this research aims to advance AI-driven approaches that can improve our understanding of biological systems and accelerate translational applications in precision medicine and therapeutics. + + +## Methods + +Method development for this work is ongoing. diff --git a/projects/index.html b/projects/index.html index 09fc0c6a1..8877ed819 100644 --- a/projects/index.html +++ b/projects/index.html @@ -51,6 +51,12 @@ +  +
+
+
+
+
|
+ Postdoc and MS/PhD Students + |
+
+ Year of graduation + |
+
+ Focus area + |
+
+ First Position + |
+
+ Last known Position + |
+
| + + | +
+ 2025 + |
+
+ ML for Neuroscience + |
+
+ Union College, Schenectady, N.Y. (TT Faculty) + |
+
+ Union College, Schenectady, N.Y. (TT Faculty) + |
+
| + + | +
+ 2023 + |
+
+ ML for Proteomics + |
+
+ |
+
+ Facebook (ML scientist) + |
+
| + + | +
+ 2023 + |
+
+ ML for Neuroscience + |
+
+ Postdoc FIU + |
+
+ Postdoc FIU + |
+
| + + | +
+ 2023 + |
+
+ HPC for Proteomics + |
+
+ UC Berkeley + |
+
+ NVIDIA (HPC Software Engineer) + |
+
| + + | +
+ 2020 + |
+
+ ML for Neuroscience + |
+
+ Zeiss + |
+
+ Apple Inc (ML Engineer) + |
+
| + + | +
+ 2019 + |
+
+ HPC for proteomics + |
+
+ UC Berkeley + |
+
+ Berkeley Labs (staff scientist) + |
+
| + + | +
+ 2017 + |
+
+ High performance Computing + |
+
+ Kalamazoo College + |
+
+ Kalamazoo College (TT Faculty) + |
+
| + + | +
+ 2017 + |
+
+ Machine-learning for biology + |
+
+ Kennesaw state university (TT faculty) + |
+
+ University of North Texas (TT Faculty) + |
+
| + + | +
+ 2017 + |
+
+ Complex Networks + |
+
+ Ohio State University (Postdoc) + |
+
+ Franklin University, Ohio (TT Faculty) + |
+