From fa601201ddf6f96cf4e7b338524133cb2f76465b Mon Sep 17 00:00:00 2001 From: Olin Blodgett Date: Fri, 10 Apr 2026 05:41:10 -0400 Subject: [PATCH] refactor: remove Neo4j ID set filtering from curation indexers The curation API is now the authoritative data source. The Neo4j ID cross-reference filters were a legacy gate from when Neo4j was the source of truth. Existing obsolete/internal checks remain as the data quality filter. Removed Neo4j filtering from: - GeneToGeneOrthologyIndexer (objectGene filter) - GeneMolecularInteractionService (interacting genes filter) - GeneGeneticInteractionService (genes + alleles filter) - GenePhenotypeAnnotationService (subject gene filter) - AllelePhenotypeAnnotationService (subject allele filter) - AGMPhenotypeAnnotationService (subject AGM filter) - SiteMapAccessionCurationIndexer (retainAll intersection) - BaseService (all Neo4j repository usage and ID set methods) - BaseInteractionService (hasInteractingGenesInNeo method) --- .../curation/GeneToGeneOrthologyIndexer.java | 18 +--- .../SiteMapAccessionCurationIndexer.java | 7 -- .../AGMPhenotypeAnnotationService.java | 4 +- .../AllelePhenotypeAnnotationService.java | 4 +- .../service/BaseInteractionService.java | 14 --- .../curation/service/BaseService.java | 85 ------------------- .../GeneGeneticInteractionService.java | 34 ++------ .../GeneMolecularInteractionService.java | 18 ++-- .../GenePhenotypeAnnotationService.java | 4 +- 9 files changed, 20 insertions(+), 168 deletions(-) diff --git a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/GeneToGeneOrthologyIndexer.java b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/GeneToGeneOrthologyIndexer.java index c6c6ebcb4..a98bd7deb 100644 --- a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/GeneToGeneOrthologyIndexer.java +++ b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/GeneToGeneOrthologyIndexer.java @@ -1,6 +1,5 @@ package org.alliancegenome.indexer.indexers.curation; -import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -18,7 +17,6 @@ import org.alliancegenome.exceptional.client.ExceptionCatcher; import org.alliancegenome.indexer.config.IndexerConfig; import org.alliancegenome.indexer.indexers.Indexer; -import org.alliancegenome.indexer.indexers.curation.service.BaseService; import org.apache.commons.collections4.CollectionUtils; import com.fasterxml.jackson.databind.ObjectMapper; @@ -33,7 +31,6 @@ public class GeneToGeneOrthologyIndexer extends Indexer { private final GeneExpressionAnnotationCrudInterface geneExpressionApi = RestProxyFactory.createProxy(GeneExpressionAnnotationCrudInterface.class, ConfigHelper.getCurationApiUrl(), RestConfig.config); private final GeneDiseaseAnnotationCrudInterface geneDiseaseApi = RestProxyFactory.createProxy(GeneDiseaseAnnotationCrudInterface.class, ConfigHelper.getCurationApiUrl(), RestConfig.config); - private Set allNeoGeneIDs; private Set geneExpressionSet; private Set geneAnnotationSet; @@ -45,8 +42,6 @@ public GeneToGeneOrthologyIndexer(IndexerConfig config) { @Override public void index(ProcessDisplayHelper display) { - BaseService baseService = new BaseService(); - allNeoGeneIDs = baseService.getAllNeoGeneIDs(); geneExpressionSet = new HashSet<>(geneExpressionApi.annotatedGeneList().getEntities()); geneAnnotationSet = new HashSet<>(geneDiseaseApi.annotatedGeneList().getEntities()); @@ -99,8 +94,7 @@ protected void startSingleThread(LinkedBlockingDeque queue) { } } - List filteredResults = filterValidResults(results); - indexDocuments(filteredResults); + indexDocuments(results); } catch (Exception e) { log.error("Error while indexing...", e); ExceptionCatcher.report(e); @@ -115,14 +109,4 @@ protected ObjectMapper customizeObjectMapper(ObjectMapper objectMapper) { return RestConfig.config.getJacksonObjectMapperFactory().createObjectMapper(); } - private List filterValidResults(List docs) { - List result = new ArrayList<>(); - for (GeneToGeneOrthologyDocument doc : docs) { - String curie = doc.getGeneToGeneOrthologyGenerated().getObjectGene().getIdentifier(); - if (allNeoGeneIDs.contains(curie)) { - result.add(doc); - } - } - return result; - } } diff --git a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/SiteMapAccessionCurationIndexer.java b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/SiteMapAccessionCurationIndexer.java index 6615feea3..de744fe96 100644 --- a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/SiteMapAccessionCurationIndexer.java +++ b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/SiteMapAccessionCurationIndexer.java @@ -12,7 +12,6 @@ import org.alliancegenome.exceptional.client.ExceptionCatcher; import org.alliancegenome.indexer.config.IndexerConfig; import org.alliancegenome.indexer.indexers.Indexer; -import org.alliancegenome.indexer.indexers.curation.service.BaseService; import org.alliancegenome.indexer.indexers.document.SiteMapIdDocument; import si.mazi.rescu.RestProxyFactory; @@ -33,10 +32,6 @@ protected void index(ProcessDisplayHelper display) { Map> map = document.getIdsByType(); - BaseService base = new BaseService(); - - map.get("allele").retainAll(base.getAllNeoAlleleIDs()); - List> alleleIdLists = partition(map.get("allele"), 15000); for (int i = 0; i < alleleIdLists.size(); i++) { @@ -49,8 +44,6 @@ protected void index(ProcessDisplayHelper display) { indexDocument(doc); } - map.get("gene").retainAll(base.getAllNeoGeneIDs()); - List> geneIdLists = partition(map.get("gene"), 15000); for (int i = 0; i < geneIdLists.size(); i++) { diff --git a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/AGMPhenotypeAnnotationService.java b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/AGMPhenotypeAnnotationService.java index 0c84c94b1..e00ca121d 100644 --- a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/AGMPhenotypeAnnotationService.java +++ b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/AGMPhenotypeAnnotationService.java @@ -109,9 +109,7 @@ public void run() { SearchResponse response = agmApi.findForPublic(page, bufferSize, params); for (AGMPhenotypeAnnotation pa : response.getResults()) { - if (isValidNeoEntity(getAllNeoModelIDs(), pa.getPhenotypeAnnotationSubject().getIdentifier())) { - fullList.offer(pa); - } + fullList.offer(pa); display.progressProcess(); } diff --git a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/AllelePhenotypeAnnotationService.java b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/AllelePhenotypeAnnotationService.java index 0afe4340e..bd7681158 100644 --- a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/AllelePhenotypeAnnotationService.java +++ b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/AllelePhenotypeAnnotationService.java @@ -107,9 +107,7 @@ public void run() { SearchResponse response = alleleApi.findForPublic(page, bufferSize, params); for (AllelePhenotypeAnnotation pa : response.getResults()) { - if (isValidNeoEntity(getAllNeoAlleleIDs(), pa.getPhenotypeAnnotationSubject().getIdentifier())) { - fullList.offer(pa); - } + fullList.offer(pa); display.progressProcess(); } diff --git a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/BaseInteractionService.java b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/BaseInteractionService.java index ae129436d..30d2ee2f5 100644 --- a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/BaseInteractionService.java +++ b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/BaseInteractionService.java @@ -38,20 +38,6 @@ protected boolean hasNoObsoletedOrInternalEntities(E return hasNoExcludedEntities(entitiesToBeValidated); } - protected boolean hasInteractingGenesInNeo(E interaction) { - if (interaction.getGeneAssociationSubject() != null) { - if (!isValidNeoEntity(getAllNeoGeneIDs(), interaction.getGeneAssociationSubject().getIdentifier())) { - return false; - } - } - if (interaction.getGeneGeneAssociationObject() != null) { - if (!isValidNeoEntity(getAllNeoGeneIDs(), interaction.getGeneGeneAssociationObject().getIdentifier())) { - return false; - } - } - return true; - } - protected E reverseInteraction(E forwardInteraction, E reverseInteraction) { if (forwardInteraction.getGeneAssociationSubject() == null || forwardInteraction.getGeneGeneAssociationObject() == null) { return null; diff --git a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/BaseService.java b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/BaseService.java index e43ef4f0b..f9a4a47dd 100644 --- a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/BaseService.java +++ b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/BaseService.java @@ -1,16 +1,10 @@ package org.alliancegenome.indexer.indexers.curation.service; import java.io.File; -import java.util.ArrayList; -import java.util.HashSet; import java.util.List; import java.util.concurrent.atomic.AtomicBoolean; import org.alliancegenome.curation_api.model.entities.base.AuditedObject; -import org.alliancegenome.neo4j.repository.AlleleRepository; -import org.alliancegenome.neo4j.repository.GeneRepository; -import org.alliancegenome.neo4j.repository.VariantRepository; -import org.apache.commons.collections4.CollectionUtils; import lombok.extern.log4j.Log4j2; import net.nilosplace.process_display.util.ObjectFileStorage; @@ -18,81 +12,6 @@ @Log4j2 public class BaseService { - private static HashSet allNeoAlleleIDs; - private static HashSet allNeoGeneIDs; - private static HashSet allNeoModelIDs; - private static HashSet allNeoVariantIDs; - - public HashSet getAllNeoAlleleIDs() { - if (allNeoAlleleIDs == null) { - String alleleIdsFileName = "allele_ids.gz"; - List alleleList = readFromCache(alleleIdsFileName, List.class); - - if (CollectionUtils.isNotEmpty(alleleList)) { - allNeoAlleleIDs = new HashSet<>(alleleList); - } else { - AlleleRepository alleleRepository = new AlleleRepository(); - allNeoAlleleIDs = new HashSet<>(alleleRepository.getAllAlleleIDs()); - alleleRepository.close(); - writeToCache(alleleIdsFileName, new ArrayList<>(allNeoAlleleIDs)); - } - } - return allNeoAlleleIDs; - } - - public HashSet getAllNeoGeneIDs() { - - if (allNeoGeneIDs == null) { - String geneIdsFileName = "gene_ids.gz"; - List geneList = readFromCache(geneIdsFileName, List.class); - - if (CollectionUtils.isNotEmpty(geneList)) { - allNeoGeneIDs = new HashSet<>(geneList); - } else { - GeneRepository geneRepository = new GeneRepository(); - allNeoGeneIDs = new HashSet<>(geneRepository.getAllGeneKeys()); - geneRepository.close(); - writeToCache(geneIdsFileName, new ArrayList<>(allNeoGeneIDs)); - } - } - - return allNeoGeneIDs; - } - - public HashSet getAllNeoModelIDs() { - if (allNeoModelIDs == null) { - String modelIdsFileName = "model_ids.gz"; - List modelList = readFromCache(modelIdsFileName, List.class); - - if (CollectionUtils.isNotEmpty(modelList)) { - allNeoModelIDs = new HashSet<>(modelList); - } else { - AlleleRepository alleleRepository = new AlleleRepository(); - allNeoModelIDs = new HashSet<>(alleleRepository.getAllModelKeys()); - alleleRepository.close(); - writeToCache(modelIdsFileName, new ArrayList<>(allNeoModelIDs)); - } - } - return allNeoModelIDs; - } - - public HashSet getAllNeoVariantIDs() { - if (allNeoVariantIDs == null) { - String variantIdsFileName = "variant_ids.gz"; - List variantList = readFromCache(variantIdsFileName, List.class); - - if (CollectionUtils.isNotEmpty(variantList)) { - allNeoVariantIDs = new HashSet<>(variantList); - } else { - VariantRepository variantRepository = new VariantRepository(); - allNeoVariantIDs = new HashSet<>(variantRepository.getAllVariantKeys()); - variantRepository.close(); - writeToCache(variantIdsFileName, new ArrayList<>(allNeoVariantIDs)); - } - } - return allNeoVariantIDs; - } - protected E readFromCache(String fileName, Class clazz) { try { ObjectFileStorage storage = new ObjectFileStorage<>(); @@ -128,8 +47,4 @@ protected boolean hasNoExcludedEntities(List entitiesToBeValidate return hasNoExcludedEntities.get(); } - protected static boolean isValidNeoEntity(HashSet neoEntityIds, String curie) { - return neoEntityIds.contains(curie); - } - } diff --git a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/GeneGeneticInteractionService.java b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/GeneGeneticInteractionService.java index 0741b8854..b01a01892 100644 --- a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/GeneGeneticInteractionService.java +++ b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/GeneGeneticInteractionService.java @@ -13,19 +13,15 @@ public List getFilteredAndReversedInteractions(List validInteractions = new ArrayList<>(); for (GeneGeneticInteraction interaction: forwardInteractions) { - if (hasPerturbatingAllelesInNeo(interaction)) { - if (hasInteractingGenesInNeo(interaction)) { - if (hasNoObsoletedOrInternalEntities(interaction)) { - validInteractions.add(interaction); - try { - GeneGeneticInteraction reverseInteraction = generateReverseInteraction(interaction); - if (reverseInteraction != null) { - validInteractions.add(reverseInteraction); - } - } catch (IOException e) { - e.printStackTrace(); - } + if (hasNoObsoletedOrInternalEntities(interaction)) { + validInteractions.add(interaction); + try { + GeneGeneticInteraction reverseInteraction = generateReverseInteraction(interaction); + if (reverseInteraction != null) { + validInteractions.add(reverseInteraction); } + } catch (IOException e) { + e.printStackTrace(); } } } @@ -33,20 +29,6 @@ public List getFilteredAndReversedInteractions(List getFilteredAndReversedInteractions(List validInteractions = new ArrayList<>(); for (GeneMolecularInteraction interaction: forwardInteractions) { - if (hasInteractingGenesInNeo(interaction)) { - if (hasNoObsoletedOrInternalEntities(interaction)) { - validInteractions.add(interaction); - try { - GeneMolecularInteraction reverseInteraction = generateReverseInteraction(interaction); - if (reverseInteraction != null) { - validInteractions.add(reverseInteraction); - } - } catch (IOException e) { - e.printStackTrace(); + if (hasNoObsoletedOrInternalEntities(interaction)) { + validInteractions.add(interaction); + try { + GeneMolecularInteraction reverseInteraction = generateReverseInteraction(interaction); + if (reverseInteraction != null) { + validInteractions.add(reverseInteraction); } + } catch (IOException e) { + e.printStackTrace(); } } } diff --git a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/GenePhenotypeAnnotationService.java b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/GenePhenotypeAnnotationService.java index fdefdf3d3..ec4c257b1 100644 --- a/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/GenePhenotypeAnnotationService.java +++ b/agr_indexer/src/main/java/org/alliancegenome/indexer/indexers/curation/service/GenePhenotypeAnnotationService.java @@ -108,9 +108,7 @@ public void run() { SearchResponse response = geneApi.findForPublic(page, bufferSize, "PhenotypeAnnotationView", params); for (GenePhenotypeAnnotation pa : response.getResults()) { - if (isValidNeoEntity(getAllNeoGeneIDs(), pa.getPhenotypeAnnotationSubject().getIdentifier())) { - fullList.offer(pa); - } + fullList.offer(pa); display.progressProcess(); }