From 18d1f17db51bb8c342b42a0621289bdde3e301e4 Mon Sep 17 00:00:00 2001 From: Neil Flood Date: Sat, 23 Aug 2025 13:40:44 +1000 Subject: [PATCH 1/3] Add explicit support for history in VRT files --- processinghistory/history.py | 76 ++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 26 deletions(-) diff --git a/processinghistory/history.py b/processinghistory/history.py index e7e8520..7401034 100644 --- a/processinghistory/history.py +++ b/processinghistory/history.py @@ -54,6 +54,7 @@ PARENTS_BY_KEY = "parentsByKey" AUTOENVVARSLIST_NAME = "HISTORY_ENVVARS_TO_AUTOINCLUDE" NO_TIMESTAMP = "UnknownTimestamp" +TIMESTAMP = "timestamp" # These GDAL drivers are known to have limits on the size of metadata which # can be stored, and so we need to keep below these, or we lose everything. @@ -70,9 +71,36 @@ def __init__(self): self.metadataByKey = {} self.parentsByKey = {} + def addParentHistory(self, parentfile): + """ + Add history from parent file to self + """ + parentHist = readHistoryFromFile(filename=parentfile) + + if parentHist is not None: + key = (os.path.basename(parentfile), + parentHist.metadataByKey[CURRENTFILE_KEY][TIMESTAMP]) + + # Convert parent's "currentfile" metadata and parentage to normal key entries + self.metadataByKey[key] = parentHist.metadataByKey[CURRENTFILE_KEY] + self.parentsByKey[key] = parentHist.parentsByKey[CURRENTFILE_KEY] + + # Remove those from parentHist + parentHist.metadataByKey.pop(CURRENTFILE_KEY) + parentHist.parentsByKey.pop(CURRENTFILE_KEY) + + # Copy over all the other ancestor metadata and parentage + self.metadataByKey.update(parentHist.metadataByKey) + self.parentsByKey.update(parentHist.parentsByKey) + else: + key = (os.path.basename(parentfile), NO_TIMESTAMP) + + # Add this parent as parent of current file + self.parentsByKey[CURRENTFILE_KEY].append(key) + def toJSON(self): """ - Return a JSON representation of the given ProcessingHistory + Return a JSON representation of the current ProcessingHistory """ d = { METADATA_BY_KEY: {}, @@ -131,7 +159,7 @@ def makeAutomaticFields(): dictn = {} # Time stamp formatted as per ISO 8601 standard, including time zone offset - dictn['timestamp'] = time.strftime("%Y-%m-%d %H:%M:%S%z", time.localtime()) + dictn[TIMESTAMP] = time.strftime("%Y-%m-%d %H:%M:%S%z", time.localtime()) dictn['login'] = getpass.getuser() @@ -239,8 +267,6 @@ def writeHistoryToFile(userDict={}, parents=[], *, filename=None, gdalDS=None): File can be specified as either a filename string or an open GDAL Dataset """ - procHist = makeProcessingHistory(userDict, parents) - if filename is not None: ds = gdal.Open(filename, gdal.GA_Update) else: @@ -250,6 +276,12 @@ def writeHistoryToFile(userDict={}, parents=[], *, filename=None, gdalDS=None): raise ProcessingHistoryError("Must supply either filename or gdalDS") drvrName = ds.GetDriver().ShortName + isVRT = (drvrName == "VRT") + if isVRT and len(parents) > 0: + msg = "History for VRT files should not have parents" + raise ProcessingHistoryError(msg) + + procHist = makeProcessingHistory(userDict, parents) # Convert to JSON procHistJSON = procHist.toJSON() @@ -295,28 +327,7 @@ def makeProcessingHistory(userDict, parents): # Now add history from each parent file procHist.parentsByKey[CURRENTFILE_KEY] = [] for parentfile in parents: - parentHist = readHistoryFromFile(filename=parentfile) - - if parentHist is not None: - key = (os.path.basename(parentfile), - parentHist.metadataByKey[CURRENTFILE_KEY]['timestamp']) - - # Convert parent's "currentfile" metadata and parentage to normal key entries - procHist.metadataByKey[key] = parentHist.metadataByKey[CURRENTFILE_KEY] - procHist.parentsByKey[key] = parentHist.parentsByKey[CURRENTFILE_KEY] - - # Remove those from parentHist - parentHist.metadataByKey.pop(CURRENTFILE_KEY) - parentHist.parentsByKey.pop(CURRENTFILE_KEY) - - # Copy over all the other ancestor metadata and parentage - procHist.metadataByKey.update(parentHist.metadataByKey) - procHist.parentsByKey.update(parentHist.parentsByKey) - else: - key = (os.path.basename(parentfile), NO_TIMESTAMP) - - # Add this parent as parent of current file - procHist.parentsByKey[CURRENTFILE_KEY].append(key) + procHist.addParentHistory(parentfile) return procHist @@ -345,6 +356,19 @@ def readHistoryFromFile(filename=None, gdalDS=None): else: procHist = None + # If this is a VRT, then read the component files as though they were + # parent files + isVRT = (ds.GetDriver().ShortName == "VRT") + if isVRT: + vrtFile = ds.GetDescription() + componentList = [fn for fn in ds.GetFileList() if fn != vrtFile] + for componentFile in componentList: + if not os.path.exists(componentFile): + msg = f"VRT file '{vrtFile}' missing component '{componentFile}'" + raise ProcessingHistoryError(msg) + + procHist.addParentHistory(componentFile) + return procHist From 879c93efcd57e5f39b8adb38cd9fa04f874aad54 Mon Sep 17 00:00:00 2001 From: Neil Flood Date: Sun, 24 Aug 2025 17:26:16 +1000 Subject: [PATCH 2/3] Keep VRT stuff inside guard for no history case --- processinghistory/history.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/processinghistory/history.py b/processinghistory/history.py index 7401034..cddc3d9 100644 --- a/processinghistory/history.py +++ b/processinghistory/history.py @@ -353,22 +353,22 @@ def readHistoryFromFile(filename=None, gdalDS=None): if procHistJSON is not None: procHist = ProcessingHistory.fromJSON(procHistJSON) + + # If this is a VRT, then read the component files as though they were + # parent files + isVRT = (ds.GetDriver().ShortName == "VRT") + if isVRT: + vrtFile = ds.GetDescription() + componentList = [fn for fn in ds.GetFileList() if fn != vrtFile] + for componentFile in componentList: + if not os.path.exists(componentFile): + msg = f"VRT file '{vrtFile}' missing component '{componentFile}'" + raise ProcessingHistoryError(msg) + + procHist.addParentHistory(componentFile) else: procHist = None - # If this is a VRT, then read the component files as though they were - # parent files - isVRT = (ds.GetDriver().ShortName == "VRT") - if isVRT: - vrtFile = ds.GetDescription() - componentList = [fn for fn in ds.GetFileList() if fn != vrtFile] - for componentFile in componentList: - if not os.path.exists(componentFile): - msg = f"VRT file '{vrtFile}' missing component '{componentFile}'" - raise ProcessingHistoryError(msg) - - procHist.addParentHistory(componentFile) - return procHist From 219c2e30a75476769a5add850550957653eccaf1 Mon Sep 17 00:00:00 2001 From: Neil Flood Date: Sun, 24 Aug 2025 19:33:39 +1000 Subject: [PATCH 3/3] Describe VRT special case in the docstring --- processinghistory/history.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/processinghistory/history.py b/processinghistory/history.py index cddc3d9..a492e07 100644 --- a/processinghistory/history.py +++ b/processinghistory/history.py @@ -30,6 +30,15 @@ value being a list of keys of the parents of that file. This dictionary stores all the ancestry relationships for the whole lineage. +History in VRT files +-------------------- +A GDAL VRT file is handled as a somewhat special case. The component files +of the VRT are treated as parents of the VRT (and there can be no other parents), +and the history of those files is read directly from them, rather than being +copied into the VRT. This is handled transparently, so that when history +is read from the VRT, it appears to have all come from there. This allows the +history of the components to be as dynamic as the data itself. + """ import sys import os @@ -362,7 +371,8 @@ def readHistoryFromFile(filename=None, gdalDS=None): componentList = [fn for fn in ds.GetFileList() if fn != vrtFile] for componentFile in componentList: if not os.path.exists(componentFile): - msg = f"VRT file '{vrtFile}' missing component '{componentFile}'" + msg = (f"VRT file '{vrtFile}' missing component " + + f"'{componentFile}'") raise ProcessingHistoryError(msg) procHist.addParentHistory(componentFile)