From 68dea4fd3bb2ca3430f197e6660f7a21757985ae Mon Sep 17 00:00:00 2001 From: markquintontulloch Date: Fri, 13 Sep 2024 23:46:28 +0100 Subject: [PATCH 1/3] Move gene ID to curie map generation --- .../jobs/executors/gff/Gff3CDSExecutor.java | 6 ++-- .../jobs/executors/gff/Gff3ExonExecutor.java | 6 ++-- .../executors/gff/Gff3TranscriptExecutor.java | 11 +++---- .../curation_api/services/Gff3Service.java | 23 ++++++++------- .../helpers/gff3/Gff3AttributesHelper.java | 29 +++++++++++-------- 5 files changed, 40 insertions(+), 35 deletions(-) diff --git a/src/main/java/org/alliancegenome/curation_api/jobs/executors/gff/Gff3CDSExecutor.java b/src/main/java/org/alliancegenome/curation_api/jobs/executors/gff/Gff3CDSExecutor.java index 81fb9c26a..a8466063d 100644 --- a/src/main/java/org/alliancegenome/curation_api/jobs/executors/gff/Gff3CDSExecutor.java +++ b/src/main/java/org/alliancegenome/curation_api/jobs/executors/gff/Gff3CDSExecutor.java @@ -100,8 +100,6 @@ private boolean runLoad( List associationIdsAdded, BackendBulkDataProvider dataProvider, String assemblyId) { - Map geneIdCurieMap = gff3Service.getIdCurieMap(gffData); - ProcessDisplayHelper ph = new ProcessDisplayHelper(); ph.addDisplayHandler(loadProcessDisplayService); ph.startProcess("GFF CDS update for " + dataProvider.name(), gffData.size()); @@ -129,7 +127,7 @@ private boolean runLoad( if (assemblyId != null) { countType = "Locations"; try { - gff3Service.loadCDSLocationAssociations(gff3EntryPair, locationIdsAdded, dataProvider, assemblyId, geneIdCurieMap); + gff3Service.loadCDSLocationAssociations(gff3EntryPair, locationIdsAdded, dataProvider, assemblyId); history.incrementCompleted(countType); } catch (ObjectUpdateException e) { history.incrementFailed(countType); @@ -142,7 +140,7 @@ private boolean runLoad( } countType = "Associations"; try { - gff3Service.loadCDSParentChildAssociations(gff3EntryPair, associationIdsAdded, dataProvider, geneIdCurieMap); + gff3Service.loadCDSParentChildAssociations(gff3EntryPair, associationIdsAdded, dataProvider); history.incrementCompleted(countType); } catch (ObjectUpdateException e) { history.incrementFailed(countType); diff --git a/src/main/java/org/alliancegenome/curation_api/jobs/executors/gff/Gff3ExonExecutor.java b/src/main/java/org/alliancegenome/curation_api/jobs/executors/gff/Gff3ExonExecutor.java index ee613ebd7..74454531e 100644 --- a/src/main/java/org/alliancegenome/curation_api/jobs/executors/gff/Gff3ExonExecutor.java +++ b/src/main/java/org/alliancegenome/curation_api/jobs/executors/gff/Gff3ExonExecutor.java @@ -100,8 +100,6 @@ private boolean runLoad( List associationIdsAdded, BackendBulkDataProvider dataProvider, String assemblyId) { - Map geneIdCurieMap = gff3Service.getIdCurieMap(gffData); - ProcessDisplayHelper ph = new ProcessDisplayHelper(); ph.addDisplayHandler(loadProcessDisplayService); ph.startProcess("GFF Exon update for " + dataProvider.name(), gffData.size()); @@ -131,7 +129,7 @@ private boolean runLoad( if (assemblyId != null) { countType = "Locations"; try { - gff3Service.loadExonLocationAssociations(gff3EntryPair, locationIdsAdded, dataProvider, assemblyId, geneIdCurieMap); + gff3Service.loadExonLocationAssociations(gff3EntryPair, locationIdsAdded, dataProvider, assemblyId); history.incrementCompleted(countType); } catch (ObjectUpdateException e) { history.incrementFailed(countType); @@ -145,7 +143,7 @@ private boolean runLoad( countType = "Associations"; try { - gff3Service.loadExonParentChildAssociations(gff3EntryPair, associationIdsAdded, dataProvider, geneIdCurieMap); + gff3Service.loadExonParentChildAssociations(gff3EntryPair, associationIdsAdded, dataProvider); history.incrementCompleted(countType); } catch (ObjectUpdateException e) { history.incrementFailed(countType); diff --git a/src/main/java/org/alliancegenome/curation_api/jobs/executors/gff/Gff3TranscriptExecutor.java b/src/main/java/org/alliancegenome/curation_api/jobs/executors/gff/Gff3TranscriptExecutor.java index 4f27aaa3f..d08d391d9 100644 --- a/src/main/java/org/alliancegenome/curation_api/jobs/executors/gff/Gff3TranscriptExecutor.java +++ b/src/main/java/org/alliancegenome/curation_api/jobs/executors/gff/Gff3TranscriptExecutor.java @@ -60,6 +60,7 @@ public void execLoad(BulkLoadFileHistory bulkLoadFileHistory) { BackendBulkDataProvider dataProvider = BackendBulkDataProvider.valueOf(fmsLoad.getFmsDataSubType()); List>> preProcessedTranscriptGffData = Gff3AttributesHelper.getTranscriptGffData(gffData, dataProvider); + Map geneIdCurieMap = gff3Service.getGeneIdCurieMap(gffData, dataProvider); gffData.clear(); @@ -73,7 +74,7 @@ public void execLoad(BulkLoadFileHistory bulkLoadFileHistory) { addException(bulkLoadFileHistory, new ObjectUpdateExceptionData(null, "GFF Header does not contain assembly", null)); } - boolean success = runLoad(bulkLoadFileHistory, gffHeaderData, preProcessedTranscriptGffData, entityIdsAdded, locationIdsAdded, associationIdsAdded, dataProvider, assemblyId); + boolean success = runLoad(bulkLoadFileHistory, gffHeaderData, preProcessedTranscriptGffData, geneIdCurieMap, entityIdsAdded, locationIdsAdded, associationIdsAdded, dataProvider, assemblyId); if (success) { runCleanup(transcriptService, bulkLoadFileHistory, dataProvider.name(), transcriptService.getIdsByDataProvider(dataProvider), entityIdsAdded, "GFF transcript"); @@ -93,14 +94,13 @@ private boolean runLoad( BulkLoadFileHistory history, List gffHeaderData, List>> gffData, + Map geneIdCurieMap, List entityIdsAdded, List locationIdsAdded, List associationIdsAdded, BackendBulkDataProvider dataProvider, String assemblyId) { - Map geneIdCurieMap = gff3Service.getIdCurieMap(gffData); - ProcessDisplayHelper ph = new ProcessDisplayHelper(); ph.addDisplayHandler(loadProcessDisplayService); ph.startProcess("GFF Transcript update for " + dataProvider.name(), gffData.size()); @@ -128,7 +128,7 @@ private boolean runLoad( if (assemblyId != null) { countType = "Locations"; try { - gff3Service.loadTranscriptLocationAssociations(gff3EntryPair, locationIdsAdded, dataProvider, assemblyId, geneIdCurieMap); + gff3Service.loadTranscriptLocationAssociations(gff3EntryPair, locationIdsAdded, dataProvider, assemblyId); history.incrementCompleted(countType); } catch (ObjectUpdateException e) { history.incrementFailed(countType); @@ -162,9 +162,10 @@ public APIResponse runLoadApi(String dataProviderName, String assemblyName, List List idsAdded = new ArrayList<>(); BackendBulkDataProvider dataProvider = BackendBulkDataProvider.valueOf(dataProviderName); List>> preProcessedTranscriptGffData = Gff3AttributesHelper.getTranscriptGffData(gffData, dataProvider); + Map geneIdCurieMap = gff3Service.getGeneIdCurieMap(gffData, dataProvider); BulkLoadFileHistory history = new BulkLoadFileHistory(); history = bulkLoadFileHistoryDAO.persist(history); - runLoad(history, null, preProcessedTranscriptGffData, idsAdded, idsAdded, idsAdded, dataProvider, assemblyName); + runLoad(history, null, preProcessedTranscriptGffData, geneIdCurieMap, idsAdded, idsAdded, idsAdded, dataProvider, assemblyName); history.finishLoad(); return new LoadHistoryResponce(history); diff --git a/src/main/java/org/alliancegenome/curation_api/services/Gff3Service.java b/src/main/java/org/alliancegenome/curation_api/services/Gff3Service.java index 35d51a418..4b41cfec2 100644 --- a/src/main/java/org/alliancegenome/curation_api/services/Gff3Service.java +++ b/src/main/java/org/alliancegenome/curation_api/services/Gff3Service.java @@ -30,6 +30,7 @@ import org.alliancegenome.curation_api.services.associations.transcriptAssociations.TranscriptExonAssociationService; import org.alliancegenome.curation_api.services.associations.transcriptAssociations.TranscriptGeneAssociationService; import org.alliancegenome.curation_api.services.associations.transcriptAssociations.TranscriptGenomicLocationAssociationService; +import org.alliancegenome.curation_api.services.helpers.gff3.Gff3AttributesHelper; import org.alliancegenome.curation_api.services.helpers.gff3.Gff3UniqueIdHelper; import org.alliancegenome.curation_api.services.ontology.NcbiTaxonTermService; import org.alliancegenome.curation_api.services.validation.dto.Gff3DtoValidator; @@ -58,7 +59,7 @@ public class Gff3Service { @Inject Gff3DtoValidator gff3DtoValidator; @Transactional - public void loadExonLocationAssociations(ImmutablePair> gffEntryPair, List idsAdded, BackendBulkDataProvider dataProvider, String assemblyId, Map geneIdCurieMap) throws ValidationException { + public void loadExonLocationAssociations(ImmutablePair> gffEntryPair, List idsAdded, BackendBulkDataProvider dataProvider, String assemblyId) throws ValidationException { Gff3DTO gffEntry = gffEntryPair.getKey(); if (StringUtils.isBlank(assemblyId)) { @@ -85,7 +86,7 @@ public void loadExonLocationAssociations(ImmutablePair> gffEntryPair, List idsAdded, BackendBulkDataProvider dataProvider, String assemblyId, Map geneIdCurieMap) throws ValidationException { + public void loadCDSLocationAssociations(ImmutablePair> gffEntryPair, List idsAdded, BackendBulkDataProvider dataProvider, String assemblyId) throws ValidationException { Gff3DTO gffEntry = gffEntryPair.getKey(); Map attributes = gffEntryPair.getValue(); if (StringUtils.isBlank(assemblyId)) { @@ -112,7 +113,7 @@ public void loadCDSLocationAssociations(ImmutablePair> gffEntryPair, List idsAdded, BackendBulkDataProvider dataProvider, String assemblyId, Map geneIdCurieMap) throws ValidationException { + public void loadTranscriptLocationAssociations(ImmutablePair> gffEntryPair, List idsAdded, BackendBulkDataProvider dataProvider, String assemblyId) throws ValidationException { Gff3DTO gffEntry = gffEntryPair.getKey(); Map attributes = gffEntryPair.getValue(); if (StringUtils.isBlank(assemblyId)) { @@ -140,7 +141,7 @@ public void loadTranscriptLocationAssociations(ImmutablePair> gffEntryPair, List idsAdded, BackendBulkDataProvider dataProvider, Map geneIdCurieMap) throws ValidationException { + public void loadExonParentChildAssociations(ImmutablePair> gffEntryPair, List idsAdded, BackendBulkDataProvider dataProvider) throws ValidationException { Gff3DTO gffEntry = gffEntryPair.getKey(); if (!StringUtils.equals(gffEntry.getType(), "exon") && !StringUtils.equals(gffEntry.getType(), "noncoding_exon")) { @@ -164,7 +165,7 @@ public void loadExonParentChildAssociations(ImmutablePair> gffEntryPair, List idsAdded, BackendBulkDataProvider dataProvider, Map geneIdCurieMap) throws ValidationException { + public void loadCDSParentChildAssociations(ImmutablePair> gffEntryPair, List idsAdded, BackendBulkDataProvider dataProvider) throws ValidationException { Gff3DTO gffEntry = gffEntryPair.getKey(); Map attributes = gffEntryPair.getValue(); @@ -211,13 +212,15 @@ public void loadGeneParentChildAssociations(ImmutablePair getIdCurieMap(List>> gffData) { + public Map getGeneIdCurieMap(List gffData, BackendBulkDataProvider dataProvider) { Map geneIdCurieMap = new HashMap<>(); - for (ImmutablePair> gffEntryPair : gffData) { - Map attributes = gffEntryPair.getValue(); - if (attributes.containsKey("ID") && attributes.containsKey("gene_id")) { - geneIdCurieMap.put(attributes.get("ID"), attributes.get("gene_id")); + for (Gff3DTO gffEntry : gffData) { + if (gffEntry.getType().contains("gene")) { + Map attributes = Gff3AttributesHelper.getAttributes(gffEntry, dataProvider); + if (attributes.containsKey("gene_id") && attributes.containsKey("ID")) { + geneIdCurieMap.put(attributes.get("ID"), attributes.get("gene_id")); + } } } diff --git a/src/main/java/org/alliancegenome/curation_api/services/helpers/gff3/Gff3AttributesHelper.java b/src/main/java/org/alliancegenome/curation_api/services/helpers/gff3/Gff3AttributesHelper.java index 2f2c0f51b..4626cd555 100644 --- a/src/main/java/org/alliancegenome/curation_api/services/helpers/gff3/Gff3AttributesHelper.java +++ b/src/main/java/org/alliancegenome/curation_api/services/helpers/gff3/Gff3AttributesHelper.java @@ -101,21 +101,26 @@ public static List>> getTranscriptGff private static void processGffEntry(Gff3DTO originalGffEntry, List>> retGffData, BackendBulkDataProvider dataProvider) { Map attributes = getAttributes(originalGffEntry, dataProvider); - if (attributes.containsKey("Parent") && attributes.get("Parent").indexOf(",") > -1) { - for (String parent : attributes.get("Parent").split(",")) { - HashMap attributesCopy = new HashMap<>(); - attributesCopy.putAll(attributes); - String[] parentIdParts = parent.split(":"); - if (parentIdParts.length == 1) { - parent = dataProvider.name() + ':' + parentIdParts[0]; + if (attributes.containsKey("Parent")) { + if (attributes.get("Parent").indexOf(",") > -1) { + for (String parent : attributes.get("Parent").split(",")) { + if (!parent.endsWith("_transposable_element")) { + HashMap attributesCopy = new HashMap<>(); + attributesCopy.putAll(attributes); + String[] parentIdParts = parent.split(":"); + if (parentIdParts.length == 1) { + parent = dataProvider.name() + ':' + parentIdParts[0]; + } + attributesCopy.put("Parent", parent); + retGffData.add(new ImmutablePair<>(originalGffEntry, attributesCopy)); + } + } + } else { + if (attributes.get("Parent").endsWith("_transposable_element")) { + retGffData.add(new ImmutablePair<>(originalGffEntry, attributes)); } - attributesCopy.put("Parent", parent); - retGffData.add(new ImmutablePair<>(originalGffEntry, attributesCopy)); } - } else { - retGffData.add(new ImmutablePair<>(originalGffEntry, attributes)); } - } } From 0885670cb1b59d2c69d2dcacfb1ae26916c6beac Mon Sep 17 00:00:00 2001 From: markquintontulloch Date: Fri, 13 Sep 2024 23:50:11 +0100 Subject: [PATCH 2/3] Fix boolean logic --- .../services/helpers/gff3/Gff3AttributesHelper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/alliancegenome/curation_api/services/helpers/gff3/Gff3AttributesHelper.java b/src/main/java/org/alliancegenome/curation_api/services/helpers/gff3/Gff3AttributesHelper.java index 4626cd555..d6d3b9bd1 100644 --- a/src/main/java/org/alliancegenome/curation_api/services/helpers/gff3/Gff3AttributesHelper.java +++ b/src/main/java/org/alliancegenome/curation_api/services/helpers/gff3/Gff3AttributesHelper.java @@ -116,7 +116,7 @@ private static void processGffEntry(Gff3DTO originalGffEntry, List(originalGffEntry, attributes)); } } From 8c3a5c3442b5e139a2ca6aafe7da6730954fcd63 Mon Sep 17 00:00:00 2001 From: markquintontulloch Date: Sat, 14 Sep 2024 00:05:31 +0100 Subject: [PATCH 3/3] Return lines without parents for validation --- .../services/helpers/gff3/Gff3AttributesHelper.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/org/alliancegenome/curation_api/services/helpers/gff3/Gff3AttributesHelper.java b/src/main/java/org/alliancegenome/curation_api/services/helpers/gff3/Gff3AttributesHelper.java index d6d3b9bd1..7d5872df1 100644 --- a/src/main/java/org/alliancegenome/curation_api/services/helpers/gff3/Gff3AttributesHelper.java +++ b/src/main/java/org/alliancegenome/curation_api/services/helpers/gff3/Gff3AttributesHelper.java @@ -120,6 +120,8 @@ private static void processGffEntry(Gff3DTO originalGffEntry, List(originalGffEntry, attributes)); } } + } else { + retGffData.add(new ImmutablePair<>(originalGffEntry, attributes)); } }