From 41314cdc9849a19d1370943e422a33318664f82d Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 25 Mar 2022 21:43:24 -0400 Subject: [PATCH 01/22] A functional testVino.py file now exists I figured out the data format for `vinoPCA.Data` --- .idea/.gitignore | 8 ++ .idea/PyVino.iml | 8 ++ .idea/inspectionProfiles/Project_Default.xml | 8 ++ .../inspectionProfiles/profiles_settings.xml | 6 ++ .idea/misc.xml | 4 + .idea/modules.xml | 8 ++ .idea/vcs.xml | 6 ++ testVino.py | 90 +++++++++++++++++++ vino.py | 3 +- 9 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/PyVino.iml create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 testVino.py diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/PyVino.iml b/.idea/PyVino.iml new file mode 100644 index 0000000..d0876a7 --- /dev/null +++ b/.idea/PyVino.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..1b700c1 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,8 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..d1e22ec --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..d2c2de9 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/testVino.py b/testVino.py new file mode 100644 index 0000000..9b9d76d --- /dev/null +++ b/testVino.py @@ -0,0 +1,90 @@ +import unittest +import numpy as np +from dcclab import Database +import os +from ramandb import RamanDB +import requests +import matplotlib.pyplot as plt +from vino import vinoPCA + +class TestVInoClass(unittest.TestCase): + @unittest.skip("NOt now") + def testInit(self): + iterable = [31, 30, 30, 30, 80, 31, 33, 31, 30, 30, 30, 30, 30, 30, 30, 30, 104, 30, 30] # sans vin blanc parceque ça shit le aspect ratio + total = sum(iterable) + + # Data = np.genfromtxt('/Users/Shooshoo/PycharmProjects/PCA_DCCLab/DataVino_Sorted.csv', delimiter=',') + db = RamanDB() + data, labels = db.getIntensities() + wavelengths = db.getWavelengths() + data = np.cat(wavelengths, wavelengths, data[:,0:total]) + self.assertEqual(data.shape[1], total) + my_Spectrums = vinoPCA(data, iterable) + + self.assertIsNotNone(my_Spectrums) + + def testRemoveFluo(self): + iterable = [31, 30, 30, 30, 80, 31, 33, 31, 30, 30, 30, 30, 30, 30, 30, 30, 104, 30, 30] # sans vin blanc parceque ça shit le aspect ratio + total = sum(iterable) + + # I need to remove this function, I don't have access to the csv file. + # Data = np.genfromtxt('/Users/Shooshoo/PycharmProjects/PCA_DCCLab/DataVino_Sorted.csv', delimiter=',') + # After a bit of playing around: column 0 is not used, column 1 is the wavelengths, then its + # the data + db = RamanDB() + data, labels = db.getIntensities() + wavelengths = db.getWavelengths() + wavelengths = np.expand_dims(wavelengths, 1) + + data = np.concatenate( (wavelengths, wavelengths, data[:,0:total]), axis=1 ) + # self.assertEqual(data.shape[1], total) + my_Spectrums = vinoPCA(data, iterable) + + self.assertIsNotNone(my_Spectrums) + + my_Spectrums.removeFLuo(my_Spectrums.Data) + + + def testDoPCA(self): + iterable = [31, 30, 30, 30, 80, 31, 33, 31, 30, 30, 30, 30, 30, 30, 30, 30, 104, 30, 30] # sans vin blanc parceque ça shit le aspect ratio + total = sum(iterable) + + # Data = np.genfromtxt('/Users/Shooshoo/PycharmProjects/PCA_DCCLab/DataVino_Sorted.csv', delimiter=',') + db = RamanDB() + data, labels = db.getIntensities() + wavelengths = db.getWavelengths() + wavelengths = np.expand_dims(wavelengths, 1) + + data = np.concatenate( (wavelengths, wavelengths, data[:,0:total]), axis=1 ) + # self.assertEqual(data.shape[1], total) + my_Spectrums = vinoPCA(data, iterable) + + self.assertIsNotNone(my_Spectrums) + + my_Spectrums.doPCA(10) + my_Spectrums.showTransformedData3D() + my_Spectrums.showTransformedData2D() + my_Spectrums.showEigenvectors() + + # def testInitDB(self): + # self.assertIsNotNone(vinoPCA().db) + + # def testColormap(self): + # vino = vinoPCA() + # cm = vino.getColorMap() + # self.assertIsNotNone(cm) + # spectra, labels = vino.db.getIntensities() + # self.assertEqual(len(cm), len(labels)) + + # def testOneSpectrum(self): + # vino = vinoPCA() + # spectra, labels = vino.db.getIntensities() + # plt.plot(spectra[:,1]) + # newSpectra = vino.removeFLuo(spectra) + # print(newSpectra) + # # plt.plot(newSpectra) + # # plt.show() + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/vino.py b/vino.py index 2f40e0f..f731edd 100644 --- a/vino.py +++ b/vino.py @@ -44,7 +44,7 @@ def removeFLuo(self, Data): nm = Data[:, 1] cm = 1 / (632.8e-9) - 1 / (nm * 1e-9) size = np.ma.size(Data, 1) - polynomial_degree = 100 + polynomial_degree = 5 filtered_datas = np.zeros(shape=(800, size - 1)) # for column in range(2, size): @@ -79,6 +79,7 @@ def doPCA(self, n:int): """ new_Datas = self.removeFLuo(self.Data) + # new_Datas = self.Data[:,0:-1] new_Datas = np.transpose(new_Datas) self.X_PCA = PCA(n_components=n) self.X_reduced = self.X_PCA.fit_transform(new_Datas[1:, :]) From fb2b74bfb6f7ec6186c6ee58810a270294aa7e7d Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 25 Mar 2022 22:08:45 -0400 Subject: [PATCH 02/22] Rewrote init without arguments to use database instead Working, but we need to flush the 'T' wines for the rest of the code to work. Bad code. --- testVino.py | 9 +++++++++ vino.py | 25 +++++++++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/testVino.py b/testVino.py index 9b9d76d..367277f 100644 --- a/testVino.py +++ b/testVino.py @@ -66,6 +66,15 @@ def testDoPCA(self): my_Spectrums.showTransformedData2D() my_Spectrums.showEigenvectors() + def testvinoPCANoArgument(self): + my_Spectrums = vinoPCA() + self.assertIsNotNone(my_Spectrums) + + my_Spectrums.doPCA(10) + my_Spectrums.showTransformedData3D() + my_Spectrums.showTransformedData2D() + my_Spectrums.showEigenvectors() + # def testInitDB(self): # self.assertIsNotNone(vinoPCA().db) diff --git a/vino.py b/vino.py index f731edd..9b45985 100644 --- a/vino.py +++ b/vino.py @@ -4,19 +4,36 @@ from sklearn.decomposition import PCA from scipy import interpolate from BaselineRemoval import BaselineRemoval - +from ramandb import RamanDB class vinoPCA: - def __init__(self, Data, numberOfEachSamples): + def __init__(self, Data=None, numberOfEachSamples=None): """ :param Data: The data on wich PCA should be done. :param colormap: An iterable that contains how many of each samples there is in Data, in the good order. """ - self.Data = Data - self.numberOfEachSamples = numberOfEachSamples + if Data is None: + self.db = RamanDB() + self.db.execute("select count(*) as count, substr(path,16,1) as id from files where id != 'T' group by id order by id") + records = self.db.fetchAll() + numberOfEachSamples = [] + for record in records: + numberOfEachSamples.append(record["count"]) + # iterable = [31, 30, 30, 30, 80, 31, 33, 31, 30, 30, 30, 30, 30, 30, 30, 30, 104, 30, + # 30] # sans vin blanc parceque ça shit le aspect ratio + total = sum(numberOfEachSamples) + + data, labels = self.db.getIntensities() + wavelengths = self.db.getWavelengths() + wavelengths = np.expand_dims(wavelengths, 1) + self.Data = np.concatenate((wavelengths, wavelengths, data[:, 0:total]), axis=1) + self.numberOfEachSamples = numberOfEachSamples + else: + self.Data = Data + self.numberOfEachSamples = numberOfEachSamples def getColorMap(self): From fdcc20a8fa2b343cdae101a7e8f0eca142d2a1f4 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 25 Mar 2022 22:18:10 -0400 Subject: [PATCH 03/22] Update vino.py Not using files anymore. --- vino.py | 52 +++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/vino.py b/vino.py index 9b45985..65af387 100644 --- a/vino.py +++ b/vino.py @@ -15,25 +15,21 @@ def __init__(self, Data=None, numberOfEachSamples=None): :param colormap: An iterable that contains how many of each samples there is in Data, in the good order. """ - if Data is None: - self.db = RamanDB() - self.db.execute("select count(*) as count, substr(path,16,1) as id from files where id != 'T' group by id order by id") - records = self.db.fetchAll() - numberOfEachSamples = [] - for record in records: - numberOfEachSamples.append(record["count"]) - # iterable = [31, 30, 30, 30, 80, 31, 33, 31, 30, 30, 30, 30, 30, 30, 30, 30, 104, 30, - # 30] # sans vin blanc parceque ça shit le aspect ratio - total = sum(numberOfEachSamples) - - data, labels = self.db.getIntensities() - wavelengths = self.db.getWavelengths() - wavelengths = np.expand_dims(wavelengths, 1) - self.Data = np.concatenate((wavelengths, wavelengths, data[:, 0:total]), axis=1) - self.numberOfEachSamples = numberOfEachSamples - else: - self.Data = Data - self.numberOfEachSamples = numberOfEachSamples + self.db = RamanDB() + self.db.execute("select count(*) as count, substr(path,16,1) as id from files where id != 'T' group by id order by id") + records = self.db.fetchAll() + numberOfEachSamples = [] + for record in records: + numberOfEachSamples.append(record["count"]) + # iterable = [31, 30, 30, 30, 80, 31, 33, 31, 30, 30, 30, 30, 30, 30, 30, 30, 104, 30, + # 30] # sans vin blanc parceque ça shit le aspect ratio + total = sum(numberOfEachSamples) + + data, labels = self.db.getIntensities() + wavelengths = self.db.getWavelengths() + wavelengths = np.expand_dims(wavelengths, 1) + self.Data = np.concatenate((wavelengths, wavelengths, data[:, 0:total]), axis=1) + self.numberOfEachSamples = numberOfEachSamples def getColorMap(self): @@ -42,13 +38,19 @@ def getColorMap(self): :return: Return a colormap to visualise different samples on the plot. """ - for i in range(0, len(self.numberOfEachSamples)): - if i == 0: - colormap = np.zeros(self.numberOfEachSamples[0]) - else: - colormap = np.append(colormap, np.ones(self.numberOfEachSamples[i]) *5*i) + spectra, labels = self.db.getIntensities() - return colormap + uniqueLabelsInOrder = sorted(set(labels)) + possibleColorsInOrder = range(len(uniqueLabelsInOrder)) + colors = {} + for identifier, color in zip(uniqueLabelsInOrder, possibleColorsInOrder): + colors[identifier] = color*5 + + colormap = [] + for identifier in labels: + colormap.append(colors[identifier]) + + return np.array(colormap[0:700]) def removeFLuo(self, Data): From 4a9afbb85ea0fab035fd596d6c21d31dc901db20 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 25 Mar 2022 22:24:50 -0400 Subject: [PATCH 04/22] Removing self.Data references to construct from database --- testVino.py | 12 ++++++------ vino.py | 13 +++++-------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/testVino.py b/testVino.py index 367277f..5223799 100644 --- a/testVino.py +++ b/testVino.py @@ -78,12 +78,12 @@ def testvinoPCANoArgument(self): # def testInitDB(self): # self.assertIsNotNone(vinoPCA().db) - # def testColormap(self): - # vino = vinoPCA() - # cm = vino.getColorMap() - # self.assertIsNotNone(cm) - # spectra, labels = vino.db.getIntensities() - # self.assertEqual(len(cm), len(labels)) + def testColormap(self): + vino = vinoPCA() + cm = vino.getColorMap() + self.assertIsNotNone(cm) + spectra, labels = vino.db.getIntensities() + self.assertEqual(len(cm), len(labels)-9) # def testOneSpectrum(self): # vino = vinoPCA() diff --git a/vino.py b/vino.py index 65af387..7ca74b6 100644 --- a/vino.py +++ b/vino.py @@ -21,15 +21,10 @@ def __init__(self, Data=None, numberOfEachSamples=None): numberOfEachSamples = [] for record in records: numberOfEachSamples.append(record["count"]) - # iterable = [31, 30, 30, 30, 80, 31, 33, 31, 30, 30, 30, 30, 30, 30, 30, 30, 104, 30, - # 30] # sans vin blanc parceque ça shit le aspect ratio total = sum(numberOfEachSamples) - data, labels = self.db.getIntensities() - wavelengths = self.db.getWavelengths() - wavelengths = np.expand_dims(wavelengths, 1) - self.Data = np.concatenate((wavelengths, wavelengths, data[:, 0:total]), axis=1) - self.numberOfEachSamples = numberOfEachSamples + self.data, self.labels = self.db.getIntensities() + self.wavelengths = self.db.getWavelengths() def getColorMap(self): @@ -96,8 +91,10 @@ def doPCA(self, n:int): :param n: number of componants to get from the PCA :return: Returns nothing. Just creats an array of the transformed datas into the new vector space """ + wavelengths = np.expand_dims(self.wavelengths, 1) + Data = np.concatenate((wavelengths, wavelengths, self.data[:, 0:total]), axis=1) - new_Datas = self.removeFLuo(self.Data) + new_Datas = self.removeFLuo(Data) # new_Datas = self.Data[:,0:-1] new_Datas = np.transpose(new_Datas) self.X_PCA = PCA(n_components=n) From d1815a4bd14c49e58a78f02a9ed813c60115d549 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 25 Mar 2022 22:33:13 -0400 Subject: [PATCH 05/22] Removed useless code in __init__, modified colormap --- testVino.py | 2 +- vino.py | 16 ++++------------ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/testVino.py b/testVino.py index 5223799..d86e73a 100644 --- a/testVino.py +++ b/testVino.py @@ -83,7 +83,7 @@ def testColormap(self): cm = vino.getColorMap() self.assertIsNotNone(cm) spectra, labels = vino.db.getIntensities() - self.assertEqual(len(cm), len(labels)-9) + self.assertEqual(len(cm), len(labels)) # def testOneSpectrum(self): # vino = vinoPCA() diff --git a/vino.py b/vino.py index 7ca74b6..89196e6 100644 --- a/vino.py +++ b/vino.py @@ -16,12 +16,6 @@ def __init__(self, Data=None, numberOfEachSamples=None): """ self.db = RamanDB() - self.db.execute("select count(*) as count, substr(path,16,1) as id from files where id != 'T' group by id order by id") - records = self.db.fetchAll() - numberOfEachSamples = [] - for record in records: - numberOfEachSamples.append(record["count"]) - total = sum(numberOfEachSamples) self.data, self.labels = self.db.getIntensities() self.wavelengths = self.db.getWavelengths() @@ -33,19 +27,17 @@ def getColorMap(self): :return: Return a colormap to visualise different samples on the plot. """ - spectra, labels = self.db.getIntensities() - - uniqueLabelsInOrder = sorted(set(labels)) + uniqueLabelsInOrder = sorted(set(self.labels)) possibleColorsInOrder = range(len(uniqueLabelsInOrder)) colors = {} for identifier, color in zip(uniqueLabelsInOrder, possibleColorsInOrder): colors[identifier] = color*5 colormap = [] - for identifier in labels: + for identifier in self.labels: colormap.append(colors[identifier]) - return np.array(colormap[0:700]) + return np.array(colormap) def removeFLuo(self, Data): @@ -92,7 +84,7 @@ def doPCA(self, n:int): :return: Returns nothing. Just creats an array of the transformed datas into the new vector space """ wavelengths = np.expand_dims(self.wavelengths, 1) - Data = np.concatenate((wavelengths, wavelengths, self.data[:, 0:total]), axis=1) + Data = np.concatenate((wavelengths, wavelengths, self.data[:, 0:700]), axis=1) new_Datas = self.removeFLuo(Data) # new_Datas = self.Data[:,0:-1] From c7d3e943c1e0fd9fa8ee0b2d5535b7c97961a84b Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 25 Mar 2022 23:07:06 -0400 Subject: [PATCH 06/22] Successfully modified removeFluo, no more database --- testVino.py | 2 +- vino.py | 55 ++++++++++++++++++----------------------------------- 2 files changed, 19 insertions(+), 38 deletions(-) diff --git a/testVino.py b/testVino.py index d86e73a..9c84701 100644 --- a/testVino.py +++ b/testVino.py @@ -70,7 +70,7 @@ def testvinoPCANoArgument(self): my_Spectrums = vinoPCA() self.assertIsNotNone(my_Spectrums) - my_Spectrums.doPCA(10) + my_Spectrums.doPCA(3) my_Spectrums.showTransformedData3D() my_Spectrums.showTransformedData2D() my_Spectrums.showEigenvectors() diff --git a/vino.py b/vino.py index 89196e6..aa05031 100644 --- a/vino.py +++ b/vino.py @@ -20,6 +20,11 @@ def __init__(self, Data=None, numberOfEachSamples=None): self.data, self.labels = self.db.getIntensities() self.wavelengths = self.db.getWavelengths() + # TO BE REMOVED + self.data = self.data[200:1000, 0:700] + self.labels = self.labels[0:700] + self.wavelengths = self.wavelengths[200:1000] + def getColorMap(self): """ @@ -39,7 +44,7 @@ def getColorMap(self): return np.array(colormap) - def removeFLuo(self, Data): + def removeFLuo(self, Data=None): """ Remove fluorescence background from the data given. @@ -47,34 +52,13 @@ def removeFLuo(self, Data): :return: A new set of Data without the background. """ - nm = Data[:, 1] - cm = 1 / (632.8e-9) - 1 / (nm * 1e-9) - size = np.ma.size(Data, 1) polynomial_degree = 5 - filtered_datas = np.zeros(shape=(800, size - 1)) - - # for column in range(2, size): - # y = Data[:, column] - # d = 25 - # f2 = interpolate.interp1d(cm[199:][::d], y[199:][::d], kind='quadratic') - # y = y[200:1000] - f2(cm[200:1000]) - # y = (y - min(y)) / max(y - min(y)) - # filt_datas[:, column - 1] = y - # filt_datas[:, 0] = cm[200:1000] - - for column in range(2, size): - spectre = Data[200:1000, column] - baseObj = BaselineRemoval(spectre) - values = baseObj.IModPoly(polynomial_degree) - # values = values - min(values) # Si tu normalises, tu perds les composants communs (Alcool particulèrement) - # values = values/max(values) # tu perds aussi le degrés de présence (Plus ou moins bouchonné ?) - # Si tu normalises pas, tu favorises les composants communs présents à - # différents degrés (Plus ou moins d'alcool). Donc tester avec et sans? - filtered_datas[:, column - 1] = values - - filtered_datas[:, 0] = Data[200:1000, 1] - - return filtered_datas + correctedSpectra = np.empty_like(self.data) + for i in range(self.data.shape[1]): + spectre = self.data[:, i] + correctedSpectra[:, i] = BaselineRemoval(spectre).IModPoly(polynomial_degree) + + return correctedSpectra def doPCA(self, n:int): @@ -83,14 +67,11 @@ def doPCA(self, n:int): :param n: number of componants to get from the PCA :return: Returns nothing. Just creats an array of the transformed datas into the new vector space """ - wavelengths = np.expand_dims(self.wavelengths, 1) - Data = np.concatenate((wavelengths, wavelengths, self.data[:, 0:700]), axis=1) - - new_Datas = self.removeFLuo(Data) + new_Datas = self.removeFLuo() # new_Datas = self.Data[:,0:-1] new_Datas = np.transpose(new_Datas) self.X_PCA = PCA(n_components=n) - self.X_reduced = self.X_PCA.fit_transform(new_Datas[1:, :]) + self.X_reduced = self.X_PCA.fit_transform(new_Datas) def showTransformedData3D(self): @@ -103,9 +84,9 @@ def showTransformedData3D(self): fig = plt.figure(1, figsize=(8, 6)) ax = Axes3D(fig, elev=-150, azim=110) ax.scatter( - self.X_reduced[:700, 0], - self.X_reduced[:700, 1], - self.X_reduced[:700, 2], + self.X_reduced[:, 0], + self.X_reduced[:, 1], + self.X_reduced[:, 2], c=self.getColorMap(), cmap='nipy_spectral', s=10) @@ -127,7 +108,7 @@ def showTransformedData2D(self): plt.clf() plt.figure(2) - plt.scatter(self.X_reduced[:700, 0], self.X_reduced[:700, 1], c=self.getColorMap(), cmap='nipy_spectral', s=10) + plt.scatter(self.X_reduced[:, 0], self.X_reduced[:, 1], c=self.getColorMap(), cmap='nipy_spectral', s=10) plt.title('First two PCA directions') plt.xlabel('1st eigenvector') plt.ylabel('2nd eigenvector') From dc1427cd327b641dc0edc30d40062ecc33c632db Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 25 Mar 2022 23:21:45 -0400 Subject: [PATCH 07/22] MOre modifications to make it more general. wavelength range is now part of the calss properties --- vino.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/vino.py b/vino.py index aa05031..413f58a 100644 --- a/vino.py +++ b/vino.py @@ -8,7 +8,7 @@ class vinoPCA: - def __init__(self, Data=None, numberOfEachSamples=None): + def __init__(self): """ :param Data: The data on wich PCA should be done. @@ -20,10 +20,9 @@ def __init__(self, Data=None, numberOfEachSamples=None): self.data, self.labels = self.db.getIntensities() self.wavelengths = self.db.getWavelengths() - # TO BE REMOVED - self.data = self.data[200:1000, 0:700] - self.labels = self.labels[0:700] - self.wavelengths = self.wavelengths[200:1000] + self.wavelengthMask = range(200, 1000) + self.data = self.data[self.wavelengthMask, :] + self.wavelengths = self.wavelengths[self.wavelengthMask] def getColorMap(self): @@ -44,12 +43,11 @@ def getColorMap(self): return np.array(colormap) - def removeFLuo(self, Data=None): + def subtractFluorescence(self): """ - Remove fluorescence background from the data given. - :param Data: The Data from witch you wish to remove fluo background. - :return: A new set of Data without the background. + Remove fluorescence background from the data. + :return: A corrected data without the background. """ polynomial_degree = 5 @@ -67,11 +65,9 @@ def doPCA(self, n:int): :param n: number of componants to get from the PCA :return: Returns nothing. Just creats an array of the transformed datas into the new vector space """ - new_Datas = self.removeFLuo() - # new_Datas = self.Data[:,0:-1] - new_Datas = np.transpose(new_Datas) - self.X_PCA = PCA(n_components=n) - self.X_reduced = self.X_PCA.fit_transform(new_Datas) + self.pca = PCA(n_components=n) + correctedData = self.subtractFluorescence() + self.X_reduced = self.pca.fit_transform(correctedData.T) def showTransformedData3D(self): @@ -128,7 +124,7 @@ def getAllEigenvectors(self): :return: an array of n eigenvector """ - return self.X_PCA.components_.transpose() + return self.pca.components_.transpose() def showEigenvectors(self): @@ -138,13 +134,13 @@ def showEigenvectors(self): """ plt.figure(3) plt.title('1st eigenvector') - plt.plot(self.X_PCA.components_.transpose()[:, 0]) + plt.plot(self.pca.components_.transpose()[:, 0]) plt.figure(4) plt.title('2nd eigenvector') - plt.plot(self.X_PCA.components_.transpose()[:, 1]) + plt.plot(self.pca.components_.transpose()[:, 1]) plt.figure(5) plt.title('3rd eigenvector') - plt.plot(self.X_PCA.components_.transpose()[:, 2]) + plt.plot(self.pca.components_.transpose()[:, 2]) plt.show() def getTransformedDatas(self): @@ -163,7 +159,7 @@ def getScreeValues(self): :return: array of the scree values, from most important to least """ - return self.X_PCA.explained_variance_ratio_ + return self.pca.explained_variance_ratio_ def plotScreeValues(self): From b6093a3a016de91f5ef3bbcdc7b90814fd24e9b4 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 26 Mar 2022 16:07:17 -0400 Subject: [PATCH 08/22] Many changes to database schema I have now defined spectrumId, which corresponds to the string "wineId-sampleId". We can now store the results of calculations in the spectra table and keep track of the algorithm used. Currently, raw or 'fluorescence-corrected' --- .idea/dataSources.xml | 17 +++++++ ramandb.py | 107 ++++++++++++++++++++++++++++++++++++++---- testDatabase.py | 43 +++++++++++++++-- testVino.py | 24 ++-------- vino.py | 10 +--- wines.txt | 27 +++++++++++ 6 files changed, 185 insertions(+), 43 deletions(-) create mode 100644 .idea/dataSources.xml create mode 100644 wines.txt diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml new file mode 100644 index 0000000..5b2b08f --- /dev/null +++ b/.idea/dataSources.xml @@ -0,0 +1,17 @@ + + + + + sqlite.xerial + true + org.sqlite.JDBC + jdbc:sqlite:$PROJECT_DIR$/raman.db + $ProjectFileDir$ + + + file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.34.0/sqlite-jdbc-3.34.0.jar + + + + + \ No newline at end of file diff --git a/ramandb.py b/ramandb.py index 52ed268..c168d39 100644 --- a/ramandb.py +++ b/ramandb.py @@ -1,12 +1,15 @@ from dcclab.database import * import numpy as np import requests +from BaselineRemoval import BaselineRemoval class RamanDB(Database): url = 'https://www.dropbox.com/s/peowchyj7xyib4w/raman.db?dl=1' def __init__(self, writePermission=False): """ Creates the database object for Raman spectra. + + The information as entered by people is here: https://docs.google.com/spreadsheets/d/1CgXRyIr7q3P26GP8Km4r9LuLH5o2APFj-4B72niTI1g/edit#gid=0 """ self.databasePath = "raman.db" @@ -19,8 +22,33 @@ def __init__(self, writePermission=False): self._wavelengths = None self.progressStart = None + self.constraints = [] super().__init__(self.databasePath, writePermission=writePermission) + def showHelp(self): + print(""" + All wines obtained from the group are in this database. Things to know: + * Wines are identified with a "wineId" that is A,B,C, .... AA, AB, AC, .... etc. + * Each wine has a number a spectrum acquisitions associated with it (typically 30, 60, etc...) + * When a Raman spectrum is acquired + + """) + + def execute(self, statement, bindings=None): + """ + This function with "bindings" is necessary to handle binary data: it cannot be inserted with a string statement. + The bindings are explained here: https://zetcode.com/db/sqlitepythontutorial/ and are similar to .format() + but are handled properly by the sqlite3 module instead of a python string. Without it, binary data + is inserted as a string, which is not good. + + See insertFileContentIntoSources() for an example. + + """ + if bindings is None: + super().execute(statement) # Call the original function from dcclab.database + else: + self.cursor.execute(statement, bindings) + def downloadDatabase(self): r = requests.get(self.url, allow_redirects=True) filename = "raman-download.db" @@ -46,15 +74,40 @@ def getWavelengths(self): return wavelengths + def getDataTypes(self): + self.execute('select dataType from spectra group by dataType') + rows = self.fetchAll() + dataTypes = [] + for row in rows: + dataTypes.append(row["dataType"]) - def getCountFiles(self): + return dataTypes + + def getIdentifiers(self): + self.execute(r"select count(*) as count, wineId as id from files group by wineId order by wineId;") + rows = self.fetchAll() + identifiers = {} + for row in rows: + id = row["id"] + nSamples = row["count"] + identifiers[id] = nSamples + return identifiers + + def getWinesSummary(self): + self.execute(r"select files.wineId, count(*) as nSamples, wines.* from files inner join wines on wines.wineId = files.wineId group by files.wineId order by files.wineId") + rows = self.fetchAll() + wines = [] + for row in rows: + wines.append(dict(row)) + return wines + + def getFileCount(self): self.execute(r"select count(*) as count from files") rows = self.fetchAll() if rows is None: return 0 return rows[0]["count"] - def getSpectraPaths(self): self.execute("select path from files order by path") rows = self.fetchAll() @@ -64,10 +117,23 @@ def getSpectraPaths(self): return paths def getIntensities(self, limit=None): + return self.getSpectraWithWineId(limit=limit) + + def getSpectraWithId(self, dataType=None, limit=None): + possibleDataTypes = self.getDataTypes() + + if dataType is None: + dataType = 'raw' + + if dataType not in possibleDataTypes: + raise ValueError('Possible dataTypes are {0}'.format(possibleDataTypes)) + stmnt = """ - select wavelength, intensity, files.path from spectra - inner join files on files.fid = spectra.fid - order by files.path, wavelength """ + select wavelength, intensity, spectra.spectrumId, wines.* from spectra + inner join files on files.spectrumId = spectra.spectrumId + inner join wines on wines.wineId = files.wineId + where dataType = '{0}' + order by files.path, wavelength """.format(dataType) wavelengths = self.getWavelengths() nWavelengths = len(wavelengths) @@ -86,14 +152,35 @@ def getIntensities(self, limit=None): return None spectra = np.zeros(shape=(nWavelengths, nSamples)) - wineIdentifiers = [""]*nSamples + spectrumIdentifiers = [""]*nSamples for i,row in enumerate(rows): spectra[i%nWavelengths, i//nWavelengths] = float(row['intensity']) - match = re.search(r"([A-Z]+)_?\d+.txt", row["path"]) - if match is not None: - wineIdentifiers[i//nWavelengths] = match.group(1) + spectrumIdentifiers[i//nWavelengths] = row['spectrumId'] + + return spectra, spectrumIdentifiers + + def storeCorrectedSpectra(self): + spectra, spectrumIds = self.getSpectraWithId() + correctedSpectra = self.subtractFluorescence(spectra) + for i in range( correctedSpectra.shape[1]): + spectrumId = spectrumIds[i] + print("Running for spectrum {0}".format(spectrumId)) + for x,y in zip(self.wavelengths, correctedSpectra[:,i]): + self.execute("insert into spectra (wavelength, intensity, spectrumId, dataType, algorithm, dateAdded) values(?, ?, ?, 'fluorescence-corrected', 'BaselineRemoval-degree5', datetime())", (x,y, spectrumId)) + + def subtractFluorescence(self, rawSpectra, polynomialDegree=5): + + """ + Remove fluorescence background from the data. + :return: A corrected data without the background. + """ + + correctedSpectra = np.empty_like(rawSpectra) + for i in range(rawSpectra.shape[1]): + spectrum = rawSpectra[:, i] + correctedSpectra[:, i] = BaselineRemoval(spectrum).IModPoly(polynomialDegree) - return spectra, wineIdentifiers + return correctedSpectra def showProgressBar(self, iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"): """ diff --git a/testDatabase.py b/testDatabase.py index 2d2bdca..02f2295 100644 --- a/testDatabase.py +++ b/testDatabase.py @@ -23,19 +23,19 @@ def testWavelengthsProperty(self): def testFileCount(self): db = RamanDB() - self.assertIsNotNone(db.getCountFiles()) - self.assertEqual(db.getCountFiles(), 709) + self.assertIsNotNone(db.getFileCount()) + self.assertEqual(db.getFileCount(), 709) def testFilePaths(self): db = RamanDB() self.assertIsNotNone(db.getSpectraPaths()) - self.assertEqual(db.getCountFiles(), len(db.getSpectraPaths())) + self.assertEqual(db.getFileCount(), len(db.getSpectraPaths())) def testGetIntensity(self): db = RamanDB() matrix, labels = db.getIntensities() self.assertIsNotNone(matrix) - self.assertEqual(matrix.shape, (len(db.wavelengths), db.getCountFiles())) + self.assertEqual(matrix.shape, (len(db.wavelengths), db.getFileCount())) @unittest.skip("Ok, tested") def testDownload(self): @@ -65,6 +65,41 @@ def testAddFileIdToDatabase(self): statement = "update spectra set fid={0} where md5='{1}'".format(record["fid"], record["md5"]) db.execute(statement) + @unittest.skip("done") + def testBuildWineIdAndSampleId(self): + db.execute('update files set sampleId=substr(path,18,2) where path like "%\_%" ESCAPE "\"') + + def testWineIdentifiers(self): + db = RamanDB() + print(db.getIdentifiers()) + + def testWinesSummary(self): + db = RamanDB() + wineSummary = db.getWinesSummary() + print(wineSummary) + + def testStoreCorrectedSpectra(self): + db = RamanDB(writePermission=False) + db.storeCorrectedSpectra() + + def testDataTypes(self): + db = RamanDB(writePermission=False) + print(db.getDataTypes()) + + def testGetSpectraValidType(self): + db = RamanDB(writePermission=False) + spectra, spectrumIds = db.getSpectraWithId(dataType='raw') + self.assertIsNotNone(spectra) + + def testGetSpectraValidTypeFluorescence(self): + db = RamanDB(writePermission=False) + spectra, spectrumIds = db.getSpectraWithId(dataType='fluorescence-corrected') + self.assertIsNotNone(spectra) + + def testGetSpectraInvalidType(self): + db = RamanDB(writePermission=False) + with self.assertRaises(ValueError): + spectra = db.getSpectraWithId(dataType='unknown') if __name__ == "__main__": unittest.main() \ No newline at end of file diff --git a/testVino.py b/testVino.py index 9c84701..934185c 100644 --- a/testVino.py +++ b/testVino.py @@ -31,33 +31,15 @@ def testRemoveFluo(self): # Data = np.genfromtxt('/Users/Shooshoo/PycharmProjects/PCA_DCCLab/DataVino_Sorted.csv', delimiter=',') # After a bit of playing around: column 0 is not used, column 1 is the wavelengths, then its # the data - db = RamanDB() - data, labels = db.getIntensities() - wavelengths = db.getWavelengths() - wavelengths = np.expand_dims(wavelengths, 1) - - data = np.concatenate( (wavelengths, wavelengths, data[:,0:total]), axis=1 ) - # self.assertEqual(data.shape[1], total) - my_Spectrums = vinoPCA(data, iterable) + my_Spectrums = vinoPCA() self.assertIsNotNone(my_Spectrums) - my_Spectrums.removeFLuo(my_Spectrums.Data) + my_Spectrums.subtractFluorescence() def testDoPCA(self): - iterable = [31, 30, 30, 30, 80, 31, 33, 31, 30, 30, 30, 30, 30, 30, 30, 30, 104, 30, 30] # sans vin blanc parceque ça shit le aspect ratio - total = sum(iterable) - - # Data = np.genfromtxt('/Users/Shooshoo/PycharmProjects/PCA_DCCLab/DataVino_Sorted.csv', delimiter=',') - db = RamanDB() - data, labels = db.getIntensities() - wavelengths = db.getWavelengths() - wavelengths = np.expand_dims(wavelengths, 1) - - data = np.concatenate( (wavelengths, wavelengths, data[:,0:total]), axis=1 ) - # self.assertEqual(data.shape[1], total) - my_Spectrums = vinoPCA(data, iterable) + my_Spectrums = vinoPCA() self.assertIsNotNone(my_Spectrums) diff --git a/vino.py b/vino.py index 413f58a..2e780f5 100644 --- a/vino.py +++ b/vino.py @@ -9,15 +9,9 @@ class vinoPCA: def __init__(self): - - """ - :param Data: The data on wich PCA should be done. - :param colormap: An iterable that contains how many of each samples there is in Data, in the good order. - """ - self.db = RamanDB() - - self.data, self.labels = self.db.getIntensities() + self.constraints = [] + self.data, self.labels = self.db.getSpectraWithLabels() self.wavelengths = self.db.getWavelengths() self.wavelengthMask = range(200, 1000) diff --git a/wines.txt b/wines.txt new file mode 100644 index 0000000..3b19938 --- /dev/null +++ b/wines.txt @@ -0,0 +1,27 @@ +A 2022/01/12 Wine Sirius Bordeaux 2018 https://www.saq.com/en/223537 VPN France Merlot, Cabernet Sauvignon 2.2 red 13 +B 2022/01/12 Wine Ménage à Trois 2019 https://www.saq.com/en/10709152 VPN United States Cabernet Sauvignon 4.3 red 13.5 +C 2022/01/22 Wine Woodbridge by Robert Mondavi https://www.saq.com/en/48611 VPN United States Cabernet Sauvignon 7.3 red 13.5 +D 2022/01/28 Wine Les Jamelles Pinot Noir Pays d'Oc https://www.saq.com/en/10802904 VPN France point noir 4 red 13 +E 2022/01/27 Wine Monasterio de las Vinas https://www.saq.com/en/854422 VPN Spain 70% Garnacha, 20% Tempranillo, 10% Carinena 2.1 red 13.5 +F 2022/02/05 Wine Revolution https://www.saq.com/en/12166892 EP United States Ruby cabernet 50 %, Carignan 32 %, Syrah 18 % 10 red 13.5 +G 2022/02/12 Wine Milhistoraise https://www.saq.com/en/13794111 EP Spain Grenache 1.7 red 14 +H 2022/02/13 Wine Wallaroo Trail Shiraz https://www.saq.com/en/12498459 EP Australia Shiraz 85 %, Cabernet sauvignon 10 %, Petit verdot 5 % 11 red 13.5 +I 2022/02/13 Wine Toro loco https://futailles.com/en/products/wine/red/toro-loco EP Spain Tempranillo 0 red 12.5 +J 2022/02/13 Wine Cantini https://vinstriani.com/produits/cantini-rouge.html EP Italy Sangiovese, Montepulciano, and Cabernet Sauvignon - red 12 +K 2022/02/13 Wine Nicolas laloux https://www.vinsenepicerie.com/en/nicolas-laloux-1/ EP Ontario.Canada Cabernet Sauvignon - red 12.5 +L 2022/02/13 Wine smoky bay SHIRAZ https://www.lcbo.com/webapp/wcs/stores/servlet/en/lcbo/red-wine-14001/smoky-bay-shiraz-17650#.YguvavXMIUo EP Australia Shiraz 10 red 13 +M 2022/02/13 Wine Dolce Venti https://futailles.com/en/products/wine/red/dolce-venti EP Italy Merlot - red 11.5 +N 2022/02/13 Wine Aroma mi Amore https://vinsarista.com/en/produit/wines/aroma-mi-amore/aroma-mi-amore-red-wine/ EP Italy Refosco - red 14.5 +O 2022/02/19 Wine Sonho Aragonez https://www.vivino.com/CA/en/sonho-aragonez/w/5905886 EP Portugal Aragonez red 12.5 +P 2022/02/27 Wine Double vie https://vinsarista.com/en/produit/wines/double-vie/red-wine/ EP Canada red 12 +Q 2022/02/28 Wine Danza https://www.iga.net/en/product/wineargentinian-red-bonarda/00000_000000082424300222 EP Argentina Douce noir red 13.7 +R 2022/02/23 Wine bu https://www.iga.net/en/product/winered-rosso-terre-sicilaine-bio-it/00000_000000005604913702 EP Italy Nero d'Avola 70% + Merlot 20% + Syrah 10% red 12.5 +S 2022/02/24 Wine Croix d'Or https://futailles.com/en/products/wine/red/croix-dor EP Moldavie pinot noir red 12.5 +T 2022/02/18 Wine AUFKELLEREIEN https://www.iga.net/fr/produit/vin-blancallemagne---fruite-et-doux-9--alcool---18-ans--/00000_000000005604980687 AR Allemagne white 9 +U 2022/02/15 Wine Macon Lugny les Cray https://www.saq.com/en/13319061 DC France Bourgogne white +V 2022/02/16 Wine Brumont Cotes de Gascogne https://www.saq.com/en/548883 DC France Sauvignon, Gros Manseng white 12 +W 2022/02/18 Wine Piuze https://www.saq.com/en/14853741 DC France Chardonnay white 12 +X 2022/02/19 Wine Chateau de Maligny https://www.saq.com/en/560763 DC France Chablis Chardonnay white 12.5 +Y 2022/02/21 Wine L'impromptu https://www.saq.com/en/13343264 DC France Gamay red 14 +Z 2022/02/22 Wine Sancerres Aurore Dezat https://www.saq.com/en/13992897 DC France Sancerre Chardonnay 1.6 white 12.5 +AA 2022/02/26 Wine Lord de la Ragotiere https://www.saq.com/en/10690501 DC France Chardonnay white 12 \ No newline at end of file From 78a731d29ecf51b7d949840eece779d0fb6295be Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 28 Mar 2022 22:54:29 -0400 Subject: [PATCH 09/22] MOved to MySQL, so no need for downloading sqlite file. --- ramandb.py | 30 ++++++---------- testDatabase.py | 95 ++++++++++++++++++++++++++++++++++--------------- vino.py | 6 +++- 3 files changed, 81 insertions(+), 50 deletions(-) diff --git a/ramandb.py b/ramandb.py index c168d39..d5e1600 100644 --- a/ramandb.py +++ b/ramandb.py @@ -4,26 +4,16 @@ from BaselineRemoval import BaselineRemoval class RamanDB(Database): - url = 'https://www.dropbox.com/s/peowchyj7xyib4w/raman.db?dl=1' - def __init__(self, writePermission=False): + def __init__(self): """ - Creates the database object for Raman spectra. - - The information as entered by people is here: https://docs.google.com/spreadsheets/d/1CgXRyIr7q3P26GP8Km4r9LuLH5o2APFj-4B72niTI1g/edit#gid=0 + The Database is a MySQL database on cafeine called `raman`. """ - - self.databasePath = "raman.db" - if not os.path.exists(self.databasePath): - print("The raman.db file is not available. Atttempting to download from {0}".format(self.url)) - filename = self.downloadDatabase() - if os.path.exists(filename) and not os.path.exists(self.databasePath): - os.rename(filename, self.databasePath) - print("Success. File has been renamed raman.db") + url = "mysql://dcclab@cafeine2.crulrg.ulaval.ca/dcclab@raman" self._wavelengths = None self.progressStart = None self.constraints = [] - super().__init__(self.databasePath, writePermission=writePermission) + super().__init__(url) def showHelp(self): print(""" @@ -64,7 +54,7 @@ def wavelengths(self): return self._wavelengths def getWavelengths(self): - self.execute(r"select distinct(wavelength) from spectra order by wavelength") + self.execute(r"select distinct(wavelength) from spectra where dataType='raw' order by wavelength") rows = self.fetchAll() nTotal = len(rows) @@ -117,7 +107,7 @@ def getSpectraPaths(self): return paths def getIntensities(self, limit=None): - return self.getSpectraWithWineId(limit=limit) + return self.getSpectraWithId(limit=limit) def getSpectraWithId(self, dataType=None, limit=None): possibleDataTypes = self.getDataTypes() @@ -133,7 +123,7 @@ def getSpectraWithId(self, dataType=None, limit=None): inner join files on files.spectrumId = spectra.spectrumId inner join wines on wines.wineId = files.wineId where dataType = '{0}' - order by files.path, wavelength """.format(dataType) + order by spectra.spectrumId, spectra.wavelength """.format(dataType) wavelengths = self.getWavelengths() nWavelengths = len(wavelengths) @@ -160,14 +150,14 @@ def getSpectraWithId(self, dataType=None, limit=None): return spectra, spectrumIdentifiers def storeCorrectedSpectra(self): - spectra, spectrumIds = self.getSpectraWithId() + spectra, spectrumIds = self.getSpectraWithId(dataType='raw') correctedSpectra = self.subtractFluorescence(spectra) for i in range( correctedSpectra.shape[1]): spectrumId = spectrumIds[i] print("Running for spectrum {0}".format(spectrumId)) for x,y in zip(self.wavelengths, correctedSpectra[:,i]): - self.execute("insert into spectra (wavelength, intensity, spectrumId, dataType, algorithm, dateAdded) values(?, ?, ?, 'fluorescence-corrected', 'BaselineRemoval-degree5', datetime())", (x,y, spectrumId)) - + # self.execute("insert into spectra (wavelength, intensity, spectrumId, dataType, algorithm, dateAdded) values(?, ?, ?, 'fluorescence-corrected', 'BaselineRemoval-degree5', datetime())", (x,y, spectrumId)) + self.execute("insert into spectra (wavelength, intensity, spectrumId, dataType, algorithm, dateAdded) values(%s, %s, %s, 'fluorescence-corrected', 'BaselineRemoval-degree5', datetime())",(x, y, spectrumId)) def subtractFluorescence(self, rawSpectra, polynomialDegree=5): """ diff --git a/testDatabase.py b/testDatabase.py index 02f2295..ca94d2b 100644 --- a/testDatabase.py +++ b/testDatabase.py @@ -6,50 +6,66 @@ import requests class TestBuildDatabase(unittest.TestCase): - def testDatabase(self): + def test01Database(self): db = RamanDB() self.assertIsNotNone(db) - self.assertTrue(os.path.exists(db.databasePath)) - def testWavelengths(self): + def test02Wavelengths(self): db = RamanDB() self.assertIsNotNone(db.getWavelengths()) - self.assertEqual(len(db.getWavelengths()), 1044) - def testWavelengthsProperty(self): + def test03WavelengthsAreUniqueAndCommon(self): + """ + Check that all RAW spectra have the same number of wavelengths. + This is a complex SQL statement with a sub-select, but it returns 1 if true and 0 if false. + """ + db = RamanDB() + db.execute(""" + SELECT + MAX(spectralPts) = MIN(spectralPts) as wavelengthsAreAllTheSame + FROM + (SELECT + COUNT(wavelength) AS spectralPts + FROM + spectra + where dataType='raw' + GROUP BY wavelength) AS something; + """) + firstRecord = db.fetchOne() + self.assertEqual(firstRecord["wavelengthsAreAllTheSame"], 1) + + def test04WavelengthsProperty(self): db = RamanDB() self.assertIsNotNone(db.wavelengths) - self.assertEqual(len(db.wavelengths), 1044) - def testFileCount(self): + def test05FileCount(self): db = RamanDB() self.assertIsNotNone(db.getFileCount()) - self.assertEqual(db.getFileCount(), 709) - def testFilePaths(self): + def test06FileCountShouldMatchRawSpectraTimesWavelength(self): + """ + NUmber of points in the spectra database for 'raw' spectra should be #wavelengths x #files + """ + db = RamanDB() + rawSpectraCount = db.getFileCount() + wavelengthsCount = len(db.getWavelengths()) + + db.execute("select count(*) as count from spectra where dataType='raw'") + valueRecord = db.fetchOne() + self.assertEqual(valueRecord["count"], rawSpectraCount*wavelengthsCount) + + def test07FilePaths(self): db = RamanDB() self.assertIsNotNone(db.getSpectraPaths()) self.assertEqual(db.getFileCount(), len(db.getSpectraPaths())) + @unittest.skip("Ok, tested") def testGetIntensity(self): db = RamanDB() matrix, labels = db.getIntensities() self.assertIsNotNone(matrix) self.assertEqual(matrix.shape, (len(db.wavelengths), db.getFileCount())) - @unittest.skip("Ok, tested") - def testDownload(self): - url = 'https://www.dropbox.com/s/2st0sv7jpii6dz8/raman.db?dl=1' - r = requests.get(url, allow_redirects=True) - with open('test.db', 'wb') as file: - file.write(r.content) - - @unittest.skip("Ok, tested") - def testDownload(self): - db = RamanDB() - filename = db.downloadDatabase() - self.assertTrue(os.path.exists(filename)) - os.remove(filename) @unittest.skip("Done, no need to redo.") def testAddFileIdToDatabase(self): @@ -76,28 +92,49 @@ def testWineIdentifiers(self): def testWinesSummary(self): db = RamanDB() wineSummary = db.getWinesSummary() - print(wineSummary) + totalNumberOfSpectra = sum([ wine["nSamples"] for wine in wineSummary]) + + db.execute("select count(*) as count from spectra where dataType='raw'") + valueRecord = db.fetchOne() + self.assertEqual(valueRecord["count"], totalNumberOfSpectra*len(db.getWavelengths())) def testStoreCorrectedSpectra(self): - db = RamanDB(writePermission=False) + db = RamanDB() db.storeCorrectedSpectra() + def testSingleSpectrum(self): + db = RamanDB() + db.execute("select wavelength, intensity from spectra where spectrumId = '0002-0001'") + records = db.fetchAll() + for record in records: + print(record) + + # def testStoreSingleSpectrum(self): + # spectra, spectrumIds = self.getSpectraWithId(dataType='raw') + # correctedSpectra = self.subtractFluorescence(spectra) + # for i in range( correctedSpectra.shape[1]): + # spectrumId = spectrumIds[i] + # print("Running for spectrum {0}".format(spectrumId)) + # for x,y in zip(self.wavelengths, correctedSpectra[:,i]): + # # self.execute("insert into spectra (wavelength, intensity, spectrumId, dataType, algorithm, dateAdded) values(?, ?, ?, 'fluorescence-corrected', 'BaselineRemoval-degree5', datetime())", (x,y, spectrumId)) + # self.execute("insert into spectra (wavelength, intensity, spectrumId, dataType, algorithm, dateAdded) values(%s, %s, %s, 'fluorescence-corrected', 'BaselineRemoval-degree5', datetime())",(x, y, spectrumId)) + def testDataTypes(self): - db = RamanDB(writePermission=False) - print(db.getDataTypes()) + db = RamanDB() + self.assertTrue('raw' in db.getDataTypes()) def testGetSpectraValidType(self): - db = RamanDB(writePermission=False) + db = RamanDB() spectra, spectrumIds = db.getSpectraWithId(dataType='raw') self.assertIsNotNone(spectra) def testGetSpectraValidTypeFluorescence(self): - db = RamanDB(writePermission=False) + db = RamanDB() spectra, spectrumIds = db.getSpectraWithId(dataType='fluorescence-corrected') self.assertIsNotNone(spectra) def testGetSpectraInvalidType(self): - db = RamanDB(writePermission=False) + db = RamanDB() with self.assertRaises(ValueError): spectra = db.getSpectraWithId(dataType='unknown') diff --git a/vino.py b/vino.py index 2e780f5..04a3db7 100644 --- a/vino.py +++ b/vino.py @@ -11,7 +11,11 @@ class vinoPCA: def __init__(self): self.db = RamanDB() self.constraints = [] - self.data, self.labels = self.db.getSpectraWithLabels() + self.data, self.labels = self.db.getSpectraWithId(dataType='raw') + self.correctedData, correctedLabel = self.db.getSpectraWithId(dataType='fluorescence-corrected') + if self.labels != correctedLabel: + raise ValueError('Not all spectra are corrected') + self.wavelengths = self.db.getWavelengths() self.wavelengthMask = range(200, 1000) From 60dfbb259fb9c6f0c534ba50312115395e3dfe92 Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 29 Mar 2022 00:57:19 -0400 Subject: [PATCH 10/22] FOund a mistake when data was imported (Q100 was labelled sampleId=10, not 100). Fixed it at great lengths, but now red wine test passes. --- ramandb.py | 65 ++++++++++++++++++++++++++++++++++--------------- testDatabase.py | 51 +++++++++++++++++++++++++++++++++++--- 2 files changed, 92 insertions(+), 24 deletions(-) diff --git a/ramandb.py b/ramandb.py index d5e1600..f8020a5 100644 --- a/ramandb.py +++ b/ramandb.py @@ -39,13 +39,6 @@ def execute(self, statement, bindings=None): else: self.cursor.execute(statement, bindings) - def downloadDatabase(self): - r = requests.get(self.url, allow_redirects=True) - filename = "raman-download.db" - with open(filename, 'wb') as file: - file.write(r.content) - return filename - @property def wavelengths(self): if self._wavelengths is None: @@ -53,6 +46,28 @@ def wavelengths(self): return self._wavelengths + def readQEProFile(self, filePath): + # text_file = open(filePath, "br") + # hash = hashlib.md5(text_file.read()).hexdigest() + # text_file.close() + + text_file = open(filePath, "r") + lines = text_file.read().splitlines() + + wavelengths = [] + intensities = [] + for line in lines: + match = re.match(r'^\s*(\d+\.?\d+)\s+(-?\d*\.?\d*)', line) + if match is not None: + intensity = match.group(2) + wavelength = match.group(1) + wavelengths.append(wavelength) + intensities.append(intensity) + else: + pass + # print("Line does not match: {0}".format(line)) + return wavelengths, intensities + def getWavelengths(self): self.execute(r"select distinct(wavelength) from spectra where dataType='raw' order by wavelength") rows = self.fetchAll() @@ -73,7 +88,7 @@ def getDataTypes(self): return dataTypes - def getIdentifiers(self): + def getWineIds(self): self.execute(r"select count(*) as count, wineId as id from files group by wineId order by wineId;") rows = self.fetchAll() identifiers = {} @@ -106,24 +121,30 @@ def getSpectraPaths(self): paths.append(row['path']) return paths - def getIntensities(self, limit=None): - return self.getSpectraWithId(limit=limit) - - def getSpectraWithId(self, dataType=None, limit=None): + def getSpectraWithId(self, dataType=None, color=None, limit=None): + whereConstraints = [] possibleDataTypes = self.getDataTypes() if dataType is None: dataType = 'raw' - if dataType not in possibleDataTypes: raise ValueError('Possible dataTypes are {0}'.format(possibleDataTypes)) + whereConstraints.append("dataType = '{0}'".format(dataType)) + + if color is not None: + whereConstraints.append("color = '{0}'".format(color)) + + if len(whereConstraints) != 0: + whereClause = "where " + " and ".join(whereConstraints) + else: + whereClause = "" stmnt = """ select wavelength, intensity, spectra.spectrumId, wines.* from spectra inner join files on files.spectrumId = spectra.spectrumId - inner join wines on wines.wineId = files.wineId - where dataType = '{0}' - order by spectra.spectrumId, spectra.wavelength """.format(dataType) + inner join wines on wines.wineId = spectra.wineId + {0} + order by spectra.spectrumId, spectra.wavelength """.format(whereClause ) wavelengths = self.getWavelengths() nWavelengths = len(wavelengths) @@ -132,11 +153,15 @@ def getSpectraWithId(self, dataType=None, limit=None): stmnt += " limit {0}".format(limit*nWavelengths) self.execute(stmnt) - rows = list(self.fetchAll()) - if rows is None: - return None - + rows = [] + row = self.fetchOne() + while row is not None: + rows.append(row) + if len(rows) % 100 == 0: + print(".", end='') + row = self.fetchOne() + nSamples = len(rows)//nWavelengths if nSamples == 0: return None diff --git a/testDatabase.py b/testDatabase.py index ca94d2b..cddb7c9 100644 --- a/testDatabase.py +++ b/testDatabase.py @@ -4,6 +4,7 @@ import os from ramandb import RamanDB import requests +import re class TestBuildDatabase(unittest.TestCase): def test01Database(self): @@ -59,13 +60,55 @@ def test07FilePaths(self): self.assertIsNotNone(db.getSpectraPaths()) self.assertEqual(db.getFileCount(), len(db.getSpectraPaths())) - @unittest.skip("Ok, tested") - def testGetIntensity(self): + def test08GetWhiteSpectra(self): db = RamanDB() - matrix, labels = db.getIntensities() + db.execute("select count(*) as count from files inner join wines on wines.wineId = files.wineId where wines.color = 'white'") + firstRecord = db.fetchOne() + whiteWineFileCount = firstRecord["count"] + + matrix, labels = db.getSpectraWithId(dataType='raw', color='white') + self.assertIsNotNone(matrix) + + self.assertEqual(matrix.shape, (len(db.wavelengths), whiteWineFileCount)) + + def test09GetRedSpectra(self): + db = RamanDB() + db.execute("select count(*) as count from files inner join wines on wines.wineId = files.wineId where wines.color = 'red'") + firstRecord = db.fetchOne() + redWineFileCount = firstRecord["count"] + + matrix, labels = db.getSpectraWithId(dataType='raw', color='red') self.assertIsNotNone(matrix) - self.assertEqual(matrix.shape, (len(db.wavelengths), db.getFileCount())) + self.assertEqual(matrix.shape, (len(db.wavelengths), redWineFileCount)) + + def testReadQEProFile(self): + db = RamanDB() + wavelengths, intensities = db.readQEProFile('originaldata/Q100.txt') + self.assertEqual(len(intensities), 1044) + + @unittest.skip("Done to fix a bad import, no need to redo") + def testInsertQSpectra(self): + db = RamanDB() + + dataDir = 'originaldata' + filePaths = os.listdir(dataDir) + for filename in filePaths: + match = re.search(r'Q(\d+)', filename) + if match is None: + continue + filePath = os.path.join(dataDir, filename) + print("Inserting {0}".format( filePath )) + sampleId = int(match.group(1)) + spectrumId = "0016-{0:04d}".format(sampleId) + + wavelengths, intensities = db.readQEProFile(filePath) + values = [] + for x,y in zip(wavelengths, intensities): + values.append("({0}, {1}, 'raw', 16, {2}, '{3}') ".format(x,y, sampleId, spectrumId)) + + bigStatement = "insert into spectra (wavelength, intensity, dataType, wineId, sampleId, spectrumId) values" + ','.join(values) + db.execute( bigStatement) @unittest.skip("Done, no need to redo.") def testAddFileIdToDatabase(self): From 3272558a6c6da9631dbb8cd146d530f075c64af0 Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 29 Mar 2022 01:39:45 -0400 Subject: [PATCH 11/22] Better functions, now can select color from getSpectra, new insert function from file --- ramandb.py | 19 +++++++++++++++++++ testDatabase.py | 32 ++++++++------------------------ 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/ramandb.py b/ramandb.py index f8020a5..2aa71c5 100644 --- a/ramandb.py +++ b/ramandb.py @@ -68,6 +68,25 @@ def readQEProFile(self, filePath): # print("Line does not match: {0}".format(line)) return wavelengths, intensities + def insertSpectralDataFromFiles(self, filePaths): + for filePath in filePaths: + match = re.search(r'([A-Z]{1,2})_?(\d{1,3})\.', filePath) + if match is None: + raise ValueError("The file does not appear to have a valid name: {0}".format(filePath)) + + wineId = int(ord(match.group(1))-ord('A')) + sampleId = int(match.group(2)) + spectrumId = "{0:04}-{1:04d}".format(wineId, sampleId) + + print("Inserting {0}".format( filePath )) + wavelengths, intensities = self.readQEProFile(filePath) + values = [] + for x,y in zip(wavelengths, intensities): + values.append("({0}, {1}, 'raw', 16, {2}, '{3}') ".format(x,y, sampleId, spectrumId)) + + bigStatement = "insert into spectra (wavelength, intensity, dataType, wineId, sampleId, spectrumId) values" + ','.join(values) + self.execute( bigStatement) + def getWavelengths(self): self.execute(r"select distinct(wavelength) from spectra where dataType='raw' order by wavelength") rows = self.fetchAll() diff --git a/testDatabase.py b/testDatabase.py index cddb7c9..3601831 100644 --- a/testDatabase.py +++ b/testDatabase.py @@ -88,27 +88,15 @@ def testReadQEProFile(self): self.assertEqual(len(intensities), 1044) @unittest.skip("Done to fix a bad import, no need to redo") - def testInsertQSpectra(self): + def testInsertAllSpectra(self): db = RamanDB() - dataDir = 'originaldata' - filePaths = os.listdir(dataDir) - for filename in filePaths: - match = re.search(r'Q(\d+)', filename) - if match is None: - continue - filePath = os.path.join(dataDir, filename) - print("Inserting {0}".format( filePath )) - sampleId = int(match.group(1)) - spectrumId = "0016-{0:04d}".format(sampleId) - - wavelengths, intensities = db.readQEProFile(filePath) - values = [] - for x,y in zip(wavelengths, intensities): - values.append("({0}, {1}, 'raw', 16, {2}, '{3}') ".format(x,y, sampleId, spectrumId)) - - bigStatement = "insert into spectra (wavelength, intensity, dataType, wineId, sampleId, spectrumId) values" + ','.join(values) - db.execute( bigStatement) + filenames = os.listdir(dataDir) + filePaths = [] + for filename in filenames: + filePaths.append(os.path.join(dataDir, filename)) + + db.insertSpectralDataFromFiles(filePaths) @unittest.skip("Done, no need to redo.") def testAddFileIdToDatabase(self): @@ -141,6 +129,7 @@ def testWinesSummary(self): valueRecord = db.fetchOne() self.assertEqual(valueRecord["count"], totalNumberOfSpectra*len(db.getWavelengths())) + @unittest.skip("Not ready") def testStoreCorrectedSpectra(self): db = RamanDB() db.storeCorrectedSpectra() @@ -166,11 +155,6 @@ def testDataTypes(self): db = RamanDB() self.assertTrue('raw' in db.getDataTypes()) - def testGetSpectraValidType(self): - db = RamanDB() - spectra, spectrumIds = db.getSpectraWithId(dataType='raw') - self.assertIsNotNone(spectra) - def testGetSpectraValidTypeFluorescence(self): db = RamanDB() spectra, spectrumIds = db.getSpectraWithId(dataType='fluorescence-corrected') From 6c6c0115ce6930b62595eb557491e240f37a5863 Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 29 Mar 2022 01:46:48 -0400 Subject: [PATCH 12/22] Fixed a debugging mistake, where wineId was always 16. --- ramandb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ramandb.py b/ramandb.py index 2aa71c5..b0286d4 100644 --- a/ramandb.py +++ b/ramandb.py @@ -82,7 +82,7 @@ def insertSpectralDataFromFiles(self, filePaths): wavelengths, intensities = self.readQEProFile(filePath) values = [] for x,y in zip(wavelengths, intensities): - values.append("({0}, {1}, 'raw', 16, {2}, '{3}') ".format(x,y, sampleId, spectrumId)) + values.append("({0}, {1}, 'raw', {2}, {3}, '{4}') ".format(x,y, wineId, sampleId, spectrumId)) bigStatement = "insert into spectra (wavelength, intensity, dataType, wineId, sampleId, spectrumId) values" + ','.join(values) self.execute( bigStatement) From ff27179ed3f574fc22225fe1ef099cd5eae67394 Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 29 Mar 2022 07:30:36 -0400 Subject: [PATCH 13/22] Better tests, some useless functions removed --- testDatabase.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/testDatabase.py b/testDatabase.py index 3601831..3cf2bc3 100644 --- a/testDatabase.py +++ b/testDatabase.py @@ -116,10 +116,6 @@ def testAddFileIdToDatabase(self): def testBuildWineIdAndSampleId(self): db.execute('update files set sampleId=substr(path,18,2) where path like "%\_%" ESCAPE "\"') - def testWineIdentifiers(self): - db = RamanDB() - print(db.getIdentifiers()) - def testWinesSummary(self): db = RamanDB() wineSummary = db.getWinesSummary() @@ -129,7 +125,7 @@ def testWinesSummary(self): valueRecord = db.fetchOne() self.assertEqual(valueRecord["count"], totalNumberOfSpectra*len(db.getWavelengths())) - @unittest.skip("Not ready") + @unittest.skip("This function is not ready") def testStoreCorrectedSpectra(self): db = RamanDB() db.storeCorrectedSpectra() @@ -141,24 +137,15 @@ def testSingleSpectrum(self): for record in records: print(record) - # def testStoreSingleSpectrum(self): - # spectra, spectrumIds = self.getSpectraWithId(dataType='raw') - # correctedSpectra = self.subtractFluorescence(spectra) - # for i in range( correctedSpectra.shape[1]): - # spectrumId = spectrumIds[i] - # print("Running for spectrum {0}".format(spectrumId)) - # for x,y in zip(self.wavelengths, correctedSpectra[:,i]): - # # self.execute("insert into spectra (wavelength, intensity, spectrumId, dataType, algorithm, dateAdded) values(?, ?, ?, 'fluorescence-corrected', 'BaselineRemoval-degree5', datetime())", (x,y, spectrumId)) - # self.execute("insert into spectra (wavelength, intensity, spectrumId, dataType, algorithm, dateAdded) values(%s, %s, %s, 'fluorescence-corrected', 'BaselineRemoval-degree5', datetime())",(x, y, spectrumId)) - def testDataTypes(self): db = RamanDB() self.assertTrue('raw' in db.getDataTypes()) def testGetSpectraValidTypeFluorescence(self): db = RamanDB() - spectra, spectrumIds = db.getSpectraWithId(dataType='fluorescence-corrected') - self.assertIsNotNone(spectra) + if 'fluorescence-corrected' in db.getDataTypes(): + spectra, spectrumIds = db.getSpectraWithId(dataType='fluorescence-corrected') + self.assertIsNotNone(spectra) def testGetSpectraInvalidType(self): db = RamanDB() From 9b52d05b730a16dd39b15d71cef2b3d7125a0353 Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 29 Mar 2022 08:25:52 -0400 Subject: [PATCH 14/22] Update testDatabase.py --- testDatabase.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/testDatabase.py b/testDatabase.py index 3cf2bc3..add5db8 100644 --- a/testDatabase.py +++ b/testDatabase.py @@ -146,6 +146,8 @@ def testGetSpectraValidTypeFluorescence(self): if 'fluorescence-corrected' in db.getDataTypes(): spectra, spectrumIds = db.getSpectraWithId(dataType='fluorescence-corrected') self.assertIsNotNone(spectra) + else: + self.skipTest("No background-corrected spectra in database") def testGetSpectraInvalidType(self): db = RamanDB() From f16f9cc09a2fa023b9c8a992f87e31e190beb86d Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 29 Mar 2022 09:46:29 -0400 Subject: [PATCH 15/22] Added new function in testdatabase to all insert corrected spectra, remove old version in RamanDB --- ramandb.py | 24 ++++++++++-------------- testDatabase.py | 26 +++++++++++++------------- 2 files changed, 23 insertions(+), 27 deletions(-) diff --git a/ramandb.py b/ramandb.py index b0286d4..42cf851 100644 --- a/ramandb.py +++ b/ramandb.py @@ -80,12 +80,17 @@ def insertSpectralDataFromFiles(self, filePaths): print("Inserting {0}".format( filePath )) wavelengths, intensities = self.readQEProFile(filePath) - values = [] - for x,y in zip(wavelengths, intensities): - values.append("({0}, {1}, 'raw', {2}, {3}, '{4}') ".format(x,y, wineId, sampleId, spectrumId)) + self.insertSpectralData(wavelengths, intensities, 'test', wineId, sampleId) - bigStatement = "insert into spectra (wavelength, intensity, dataType, wineId, sampleId, spectrumId) values" + ','.join(values) - self.execute( bigStatement) + def insertSpectralData(self, wavelengths, intensities, dataType, wineId, sampleId, algorithm=None): + spectrumId = "{0:04}-{1:04d}".format(wineId, sampleId) + + values = [] + for x,y in zip(wavelengths, intensities): + values.append("({0}, {1}, '{2}', {3}, {4}, '{5}', now(), '{6}') ".format(x,y, dataType, wineId, sampleId, spectrumId, algorithm)) + + bigStatement = "insert into spectra (wavelength, intensity, dataType, wineId, sampleId, spectrumId, dateAdded, algorithm) values" + ','.join(values) + self.execute( bigStatement) def getWavelengths(self): self.execute(r"select distinct(wavelength) from spectra where dataType='raw' order by wavelength") @@ -193,15 +198,6 @@ def getSpectraWithId(self, dataType=None, color=None, limit=None): return spectra, spectrumIdentifiers - def storeCorrectedSpectra(self): - spectra, spectrumIds = self.getSpectraWithId(dataType='raw') - correctedSpectra = self.subtractFluorescence(spectra) - for i in range( correctedSpectra.shape[1]): - spectrumId = spectrumIds[i] - print("Running for spectrum {0}".format(spectrumId)) - for x,y in zip(self.wavelengths, correctedSpectra[:,i]): - # self.execute("insert into spectra (wavelength, intensity, spectrumId, dataType, algorithm, dateAdded) values(?, ?, ?, 'fluorescence-corrected', 'BaselineRemoval-degree5', datetime())", (x,y, spectrumId)) - self.execute("insert into spectra (wavelength, intensity, spectrumId, dataType, algorithm, dateAdded) values(%s, %s, %s, 'fluorescence-corrected', 'BaselineRemoval-degree5', datetime())",(x, y, spectrumId)) def subtractFluorescence(self, rawSpectra, polynomialDegree=5): """ diff --git a/testDatabase.py b/testDatabase.py index add5db8..ec38ff5 100644 --- a/testDatabase.py +++ b/testDatabase.py @@ -87,7 +87,7 @@ def testReadQEProFile(self): wavelengths, intensities = db.readQEProFile('originaldata/Q100.txt') self.assertEqual(len(intensities), 1044) - @unittest.skip("Done to fix a bad import, no need to redo") + # @unittest.skip("Done to fix a bad import, no need to redo") def testInsertAllSpectra(self): db = RamanDB() dataDir = 'originaldata' @@ -98,19 +98,19 @@ def testInsertAllSpectra(self): db.insertSpectralDataFromFiles(filePaths) - @unittest.skip("Done, no need to redo.") - def testAddFileIdToDatabase(self): - db = RamanDB(writePermission=True) - db.execute("select * from files order by path") - records = db.fetchAll() - for i, record in enumerate(records): - db.execute("update files set fid={0} where md5='{1}'".format(i, record["md5"])) + def testInsertAllCorrectedSpectra(self): + db = RamanDB() + spectra, labels = db.getSpectraWithId(dataType='raw') + degree = 100 + correctedSpectra = db.subtractFluorescence(spectra, polynomialDegree=degree) - db.execute("select spectra.md5, files.fid from spectra inner join files on files.md5 = spectra.md5") - records = db.fetchAll() - for i, record in enumerate(records): - statement = "update spectra set fid={0} where md5='{1}'".format(record["fid"], record["md5"]) - db.execute(statement) + for i, label in enumerate(labels): + print("{0}/{1}".format(i, len(labels))) + + match = re.search(r"(\d+)-(\d+)", label) + wineId = int(match.group(1)) + sampleId = int(match.group(2)) + db.insertSpectralData(db.wavelengths, correctedSpectra[:,i], 'fluorescence-corrected', wineId, sampleId, 'BaselineRemoval-nomask-degree{0}'.format(degree)) @unittest.skip("done") def testBuildWineIdAndSampleId(self): From 57955e93433c69bbc9375c4b38b5f9c4ec09336b Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 29 Mar 2022 17:39:09 -0400 Subject: [PATCH 16/22] Small speed improvements Getting spectra is 4x faster thanks to a better subselect --- ramandb.py | 95 +++++++++++++++++++++++++++++++++++++++++++------ testDatabase.py | 35 +++++++++--------- 2 files changed, 103 insertions(+), 27 deletions(-) diff --git a/ramandb.py b/ramandb.py index 42cf851..6927730 100644 --- a/ramandb.py +++ b/ramandb.py @@ -39,6 +39,24 @@ def execute(self, statement, bindings=None): else: self.cursor.execute(statement, bindings) + def executeCount(self, statement, bindings=None): + """ + This function with "bindings" is necessary to handle binary data: it cannot be inserted with a string statement. + The bindings are explained here: https://zetcode.com/db/sqlitepythontutorial/ and are similar to .format() + but are handled properly by the sqlite3 module instead of a python string. Without it, binary data + is inserted as a string, which is not good. + + See insertFileContentIntoSources() for an example. + + """ + self.execute(statement, bindings) + singleRecord = self.fetchOne() + keys = list(singleRecord.keys()) + if len(keys) == 1: + return int(singleRecord[keys[0]]) + else: + return None + @property def wavelengths(self): if self._wavelengths is None: @@ -68,7 +86,8 @@ def readQEProFile(self, filePath): # print("Line does not match: {0}".format(line)) return wavelengths, intensities - def insertSpectralDataFromFiles(self, filePaths): + def insertSpectralDataFromFiles(self, filePaths, dataType='raw'): + inserted = 0 for filePath in filePaths: match = re.search(r'([A-Z]{1,2})_?(\d{1,3})\.', filePath) if match is None: @@ -78,16 +97,26 @@ def insertSpectralDataFromFiles(self, filePaths): sampleId = int(match.group(2)) spectrumId = "{0:04}-{1:04d}".format(wineId, sampleId) - print("Inserting {0}".format( filePath )) wavelengths, intensities = self.readQEProFile(filePath) - self.insertSpectralData(wavelengths, intensities, 'test', wineId, sampleId) + try: + self.insertSpectralData(wavelengths, intensities, dataType, wineId, sampleId) + print("Inserted {0}".format(filePath)) + inserted += 1 + except ValueError as err: + print(err) + + return inserted def insertSpectralData(self, wavelengths, intensities, dataType, wineId, sampleId, algorithm=None): spectrumId = "{0:04}-{1:04d}".format(wineId, sampleId) + count = self.executeCount('select count(*) as count from spectra where spectrumId = "{0}" and dataType = "{1}"'.format(spectrumId, dataType)) + if count != 0 : + raise ValueError("Spectrum {0} already exists with dataType='{1}'".format(spectrumId, dataType)) + values = [] for x,y in zip(wavelengths, intensities): - values.append("({0}, {1}, '{2}', {3}, {4}, '{5}', now(), '{6}') ".format(x,y, dataType, wineId, sampleId, spectrumId, algorithm)) + values.append("({0}, {1}, '{2}', {3}, {4}, '{5}', now(), '{6}') ".format(x,float(y), dataType, wineId, sampleId, spectrumId, algorithm)) bigStatement = "insert into spectra (wavelength, intensity, dataType, wineId, sampleId, spectrumId, dateAdded, algorithm) values" + ','.join(values) self.execute( bigStatement) @@ -104,7 +133,7 @@ def getWavelengths(self): return wavelengths def getDataTypes(self): - self.execute('select dataType from spectra group by dataType') + self.execute('select distinct dataType from spectra') rows = self.fetchAll() dataTypes = [] for row in rows: @@ -145,6 +174,53 @@ def getSpectraPaths(self): paths.append(row['path']) return paths + def getSpectrum(self, dataType, spectrumId): + whereConstraints = [] + possibleDataTypes = self.getDataTypes() + + if dataType is None: + dataType = 'raw' + if dataType not in possibleDataTypes: + raise ValueError('Possible dataTypes are {0}'.format(possibleDataTypes)) + whereConstraints.append("dataType = '{0}'".format(dataType)) + + whereConstraints.append("spectrumId = '{0}'".format(spectrumId)) + + if len(whereConstraints) != 0: + whereClause = "where " + " and ".join(whereConstraints) + else: + whereClause = "" + + stmnt = """ + select wavelength, intensity, spectra.spectrumId from spectra + {0} + order by spectra.spectrumId, spectra.wavelength """.format(whereClause ) + + wavelengths = self.getWavelengths() + nWavelengths = len(wavelengths) + + self.execute(stmnt) + + rows = [] + row = self.fetchOne() + while row is not None: + rows.append(row) + if len(rows) % 100 == 0: + print(".", end='') + row = self.fetchOne() + + nSamples = len(rows)//nWavelengths + if nSamples == 0: + return None + + spectra = np.zeros(shape=(nWavelengths, nSamples)) + spectrumIdentifiers = [""]*nSamples + for i,row in enumerate(rows): + spectra[i%nWavelengths, i//nWavelengths] = float(row['intensity']) + spectrumIdentifiers[i//nWavelengths] = row['spectrumId'] + + return spectra, spectrumIdentifiers + def getSpectraWithId(self, dataType=None, color=None, limit=None): whereConstraints = [] possibleDataTypes = self.getDataTypes() @@ -156,7 +232,7 @@ def getSpectraWithId(self, dataType=None, color=None, limit=None): whereConstraints.append("dataType = '{0}'".format(dataType)) if color is not None: - whereConstraints.append("color = '{0}'".format(color)) + whereConstraints.append(' wineId in (select wineId from wines where color="{0}") '.format(color)) if len(whereConstraints) != 0: whereClause = "where " + " and ".join(whereConstraints) @@ -164,10 +240,9 @@ def getSpectraWithId(self, dataType=None, color=None, limit=None): whereClause = "" stmnt = """ - select wavelength, intensity, spectra.spectrumId, wines.* from spectra - inner join files on files.spectrumId = spectra.spectrumId - inner join wines on wines.wineId = spectra.wineId - {0} + select wavelength, intensity, spectra.spectrumId + from spectra + {0} order by spectra.spectrumId, spectra.wavelength """.format(whereClause ) wavelengths = self.getWavelengths() diff --git a/testDatabase.py b/testDatabase.py index ec38ff5..0ab1945 100644 --- a/testDatabase.py +++ b/testDatabase.py @@ -6,7 +6,7 @@ import requests import re -class TestBuildDatabase(unittest.TestCase): +class TestRamanDatabase(unittest.TestCase): def test01Database(self): db = RamanDB() self.assertIsNotNone(db) @@ -87,7 +87,6 @@ def testReadQEProFile(self): wavelengths, intensities = db.readQEProFile('originaldata/Q100.txt') self.assertEqual(len(intensities), 1044) - # @unittest.skip("Done to fix a bad import, no need to redo") def testInsertAllSpectra(self): db = RamanDB() dataDir = 'originaldata' @@ -96,21 +95,28 @@ def testInsertAllSpectra(self): for filename in filenames: filePaths.append(os.path.join(dataDir, filename)) - db.insertSpectralDataFromFiles(filePaths) + inserted = db.insertSpectralDataFromFiles(filePaths) + if inserted == 0: + self.skipTest("Nothing was inserted") - def testInsertAllCorrectedSpectra(self): + def testExecuteCount(self): db = RamanDB() - spectra, labels = db.getSpectraWithId(dataType='raw') - degree = 100 - correctedSpectra = db.subtractFluorescence(spectra, polynomialDegree=degree) - - for i, label in enumerate(labels): - print("{0}/{1}".format(i, len(labels))) + self.assertTrue(db.executeCount("select count(*) as count from spectra") > 0) - match = re.search(r"(\d+)-(\d+)", label) + def testInsertAllCorrectedSpectra(self): + db = RamanDB() + db.execute("select distinct spectrumId from spectra") + records = db.fetchAll() + for record in records: + spectrumId = record["spectrumId"] + spectrum, labels = db.getSpectrum(dataType='raw', spectrumId=spectrumId) + degree = 100 + correctedSpectrum = db.subtractFluorescence(spectrum, polynomialDegree=degree) + print(spectrumId) + match = re.search(r"(\d+)-(\d+)", spectrumId) wineId = int(match.group(1)) sampleId = int(match.group(2)) - db.insertSpectralData(db.wavelengths, correctedSpectra[:,i], 'fluorescence-corrected', wineId, sampleId, 'BaselineRemoval-nomask-degree{0}'.format(degree)) + db.insertSpectralData(db.wavelengths, correctedSpectrum[:,:], 'fluorescence-corrected', wineId, sampleId, 'BaselineRemoval-nomask-degree{0}'.format(degree)) @unittest.skip("done") def testBuildWineIdAndSampleId(self): @@ -125,11 +131,6 @@ def testWinesSummary(self): valueRecord = db.fetchOne() self.assertEqual(valueRecord["count"], totalNumberOfSpectra*len(db.getWavelengths())) - @unittest.skip("This function is not ready") - def testStoreCorrectedSpectra(self): - db = RamanDB() - db.storeCorrectedSpectra() - def testSingleSpectrum(self): db = RamanDB() db.execute("select wavelength, intensity from spectra where spectrumId = '0002-0001'") From b41f88ce19b87c2dded6b41ae0d054478b8d3507 Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 29 Mar 2022 17:44:39 -0400 Subject: [PATCH 17/22] Better checks before inserts to avoid duplicates. --- testDatabase.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/testDatabase.py b/testDatabase.py index 0ab1945..a667614 100644 --- a/testDatabase.py +++ b/testDatabase.py @@ -105,8 +105,11 @@ def testExecuteCount(self): def testInsertAllCorrectedSpectra(self): db = RamanDB() - db.execute("select distinct spectrumId from spectra") + db.execute("select distinct spectrumId from spectra where spectrumId not in (select spectrumId from spectra where dataType='fluorescence-corrected')") records = db.fetchAll() + if len(records) == 0: + self.skipTest("All corrected spectra exist in the database") + for record in records: spectrumId = record["spectrumId"] spectrum, labels = db.getSpectrum(dataType='raw', spectrumId=spectrumId) From 1b91f9de6f870bd84d9480243b9a9f28f8fa8ea6 Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 29 Mar 2022 23:40:26 -0400 Subject: [PATCH 18/22] New test for local database/remote database --- ramandb.py | 104 ++++++++++++++++++++++++++++-------- testDatabase.py | 139 +++++++++++++++++++++++++----------------------- 2 files changed, 153 insertions(+), 90 deletions(-) diff --git a/ramandb.py b/ramandb.py index 6927730..3725b18 100644 --- a/ramandb.py +++ b/ramandb.py @@ -1,19 +1,27 @@ +import dcclab from dcclab.database import * import numpy as np import requests from BaselineRemoval import BaselineRemoval +import re -class RamanDB(Database): - def __init__(self): +class RamanDB(dcclab.database.Database): + def __init__(self, databaseURL = None): """ The Database is a MySQL database on cafeine called `raman`. """ - url = "mysql://dcclab@cafeine2.crulrg.ulaval.ca/dcclab@raman" + if databaseURL is None: + databaseURL = "mysql://dcclab@cafeine2.crulrg.ulaval.ca/dcclab@raman" self._wavelengths = None + self._wavelengthMask = None self.progressStart = None self.constraints = [] - super().__init__(url) + self.pumpWavelengthInNm = 785 + super().__init__(databaseURL) + + if dcclab.__version__ < "1.0.3": + print("You should update PyDCCLab with `pip install dcclab` to get the latest version.") def showHelp(self): print(""" @@ -57,6 +65,22 @@ def executeCount(self, statement, bindings=None): else: return None + def parseURL(self, url): + #mysql://sshusername:sshpassword@cafeine2.crulrg.ulaval.ca/mysqlusername:mysqlpassword@questions + if dcclab.__version__ >= "1.0.4": + print("No need to patch parseURL in this dcclab version") + + match = re.search("(mysql)://(.*?)@?([^@]+?)/(.*?)@(.+)", url) + if match is not None: + protocol = Engine.mysql + sshuser = match.group(2) + host = match.group(3) + mysqluser = match.group(4) + database = match.group(5) + return (protocol, sshuser, host, mysqluser, database) + else: + return (Engine.sqlite3, None, "127.0.0.1", None, url) + @property def wavelengths(self): if self._wavelengths is None: @@ -64,26 +88,54 @@ def wavelengths(self): return self._wavelengths + @property + def wavenumbers(self): + return 1e7*(1.0/self.pumpWavelengthInNm - 1.0/self.wavelengths) + + @property + def wavelengthMask(self): + if self._wavelengthMask is None: + self._wavelengthMask = self.getWavelengthMask() + + return self._wavelengthMask + + def getWavelengthMask(self): + self.execute(r"select distinct(wavelength), intensity from spectra where dataType='mask-wine' order by wavelength") + rows = self.fetchAll() + nTotal = len(rows) + + if nTotal != 0: + mask = np.zeros(shape=(nTotal),dtype=bool) + for i,row in enumerate(rows): + mask[i] = bool(row['intensity']) + else: + mask = np.zeros(shape=(len(self.wavelengths))) + for i in range(200, 1000): + mask[i] = True + self.insertSpectralData(wavelengths=self.wavelengths, intensities=mask, dataType='mask-wine', wineId=None, sampleId=None, algorithm='BaselineRemoval') + + return mask + def readQEProFile(self, filePath): # text_file = open(filePath, "br") # hash = hashlib.md5(text_file.read()).hexdigest() # text_file.close() - text_file = open(filePath, "r") - lines = text_file.read().splitlines() - - wavelengths = [] - intensities = [] - for line in lines: - match = re.match(r'^\s*(\d+\.?\d+)\s+(-?\d*\.?\d*)', line) - if match is not None: - intensity = match.group(2) - wavelength = match.group(1) - wavelengths.append(wavelength) - intensities.append(intensity) - else: - pass - # print("Line does not match: {0}".format(line)) + with open(filePath, "r") as text_file: + lines = text_file.read().splitlines() + + wavelengths = [] + intensities = [] + for line in lines: + match = re.match(r'^\s*(\d+\.?\d+)\s+(-?\d*\.?\d*)', line) + if match is not None: + intensity = match.group(2) + wavelength = match.group(1) + wavelengths.append(wavelength) + intensities.append(intensity) + else: + pass + # print("Line does not match: {0}".format(line)) return wavelengths, intensities def insertSpectralDataFromFiles(self, filePaths, dataType='raw'): @@ -108,7 +160,10 @@ def insertSpectralDataFromFiles(self, filePaths, dataType='raw'): return inserted def insertSpectralData(self, wavelengths, intensities, dataType, wineId, sampleId, algorithm=None): - spectrumId = "{0:04}-{1:04d}".format(wineId, sampleId) + if wineId is None or sampleId is None: + spectrumId = None + else: + spectrumId = "{0:04}-{1:04d}".format(wineId, sampleId) count = self.executeCount('select count(*) as count from spectra where spectrumId = "{0}" and dataType = "{1}"'.format(spectrumId, dataType)) if count != 0 : @@ -116,7 +171,7 @@ def insertSpectralData(self, wavelengths, intensities, dataType, wineId, sampleI values = [] for x,y in zip(wavelengths, intensities): - values.append("({0}, {1}, '{2}', {3}, {4}, '{5}', now(), '{6}') ".format(x,float(y), dataType, wineId, sampleId, spectrumId, algorithm)) + values.append("({0}, {1}, '{2}', '{3}', '{4}', '{5}', now(), '{6}') ".format(x,float(y), dataType, wineId, sampleId, spectrumId, algorithm)) bigStatement = "insert into spectra (wavelength, intensity, dataType, wineId, sampleId, spectrumId, dateAdded, algorithm) values" + ','.join(values) self.execute( bigStatement) @@ -152,6 +207,9 @@ def getWineIds(self): return identifiers def getWinesSummary(self): + # mysql.connector.errors.ProgrammingError: 1055( + # 42000): Expression # 4 of SELECT list is not in GROUP BY clause and contains nonaggregated column 'raman.wines.dateOpened' which is not functionally dependent on columns in GROUP BY clause; this is incompatible with sql_mode=only_full_group_by + self.execute(r"select files.wineId, count(*) as nSamples, wines.* from files inner join wines on wines.wineId = files.wineId group by files.wineId order by files.wineId") rows = self.fetchAll() wines = [] @@ -211,7 +269,7 @@ def getSpectrum(self, dataType, spectrumId): nSamples = len(rows)//nWavelengths if nSamples == 0: - return None + return None, None spectra = np.zeros(shape=(nWavelengths, nSamples)) spectrumIdentifiers = [""]*nSamples @@ -245,7 +303,7 @@ def getSpectraWithId(self, dataType=None, color=None, limit=None): {0} order by spectra.spectrumId, spectra.wavelength """.format(whereClause ) - wavelengths = self.getWavelengths() + wavelengths = self.wavelengths nWavelengths = len(wavelengths) if limit is not None: diff --git a/testDatabase.py b/testDatabase.py index a667614..11af9f6 100644 --- a/testDatabase.py +++ b/testDatabase.py @@ -7,21 +7,25 @@ import re class TestRamanDatabase(unittest.TestCase): + def setUp(self): + self.db = RamanDB() + # self.db = RamanDB("mysql://127.0.0.1/root@raman") + self.assertIsNotNone(self.db) + + @unittest.skip("Now in setUp") def test01Database(self): - db = RamanDB() - self.assertIsNotNone(db) + self.db = RamanDB() + self.assertIsNotNone(self.db) def test02Wavelengths(self): - db = RamanDB() - self.assertIsNotNone(db.getWavelengths()) + self.assertIsNotNone(self.db.getWavelengths()) def test03WavelengthsAreUniqueAndCommon(self): """ Check that all RAW spectra have the same number of wavelengths. This is a complex SQL statement with a sub-select, but it returns 1 if true and 0 if false. """ - db = RamanDB() - db.execute(""" + self.db.execute(""" SELECT MAX(spectralPts) = MIN(spectralPts) as wavelengthsAreAllTheSame FROM @@ -32,131 +36,132 @@ def test03WavelengthsAreUniqueAndCommon(self): where dataType='raw' GROUP BY wavelength) AS something; """) - firstRecord = db.fetchOne() + firstRecord = self.db.fetchOne() self.assertEqual(firstRecord["wavelengthsAreAllTheSame"], 1) def test04WavelengthsProperty(self): - db = RamanDB() - self.assertIsNotNone(db.wavelengths) + self.assertIsNotNone(self.db.wavelengths) def test05FileCount(self): - db = RamanDB() - self.assertIsNotNone(db.getFileCount()) + self.assertIsNotNone(self.db.getFileCount()) def test06FileCountShouldMatchRawSpectraTimesWavelength(self): """ NUmber of points in the spectra database for 'raw' spectra should be #wavelengths x #files """ - db = RamanDB() - rawSpectraCount = db.getFileCount() - wavelengthsCount = len(db.getWavelengths()) + rawSpectraCount = self.db.getFileCount() + wavelengthsCount = len(self.db.getWavelengths()) - db.execute("select count(*) as count from spectra where dataType='raw'") - valueRecord = db.fetchOne() + self.db.execute("select count(*) as count from spectra where dataType='raw'") + valueRecord = self.db.fetchOne() self.assertEqual(valueRecord["count"], rawSpectraCount*wavelengthsCount) def test07FilePaths(self): - db = RamanDB() - self.assertIsNotNone(db.getSpectraPaths()) - self.assertEqual(db.getFileCount(), len(db.getSpectraPaths())) + self.assertIsNotNone(self.db.getSpectraPaths()) + self.assertEqual(self.db.getFileCount(), len(self.db.getSpectraPaths())) def test08GetWhiteSpectra(self): - db = RamanDB() - db.execute("select count(*) as count from files inner join wines on wines.wineId = files.wineId where wines.color = 'white'") - firstRecord = db.fetchOne() + self.db.execute("select count(*) as count from files inner join wines on wines.wineId = files.wineId where wines.color = 'white'") + firstRecord = self.db.fetchOne() whiteWineFileCount = firstRecord["count"] - matrix, labels = db.getSpectraWithId(dataType='raw', color='white') + matrix, labels = self.db.getSpectraWithId(dataType='raw', color='white') self.assertIsNotNone(matrix) - self.assertEqual(matrix.shape, (len(db.wavelengths), whiteWineFileCount)) + self.assertEqual(matrix.shape, (len(self.db.wavelengths), whiteWineFileCount)) def test09GetRedSpectra(self): - db = RamanDB() - db.execute("select count(*) as count from files inner join wines on wines.wineId = files.wineId where wines.color = 'red'") - firstRecord = db.fetchOne() + self.db.execute("select count(*) as count from files inner join wines on wines.wineId = files.wineId where wines.color = 'red'") + firstRecord = self.db.fetchOne() redWineFileCount = firstRecord["count"] - matrix, labels = db.getSpectraWithId(dataType='raw', color='red') + matrix, labels = self.db.getSpectraWithId(dataType='raw', color='red') self.assertIsNotNone(matrix) - self.assertEqual(matrix.shape, (len(db.wavelengths), redWineFileCount)) + self.assertEqual(matrix.shape, (len(self.db.wavelengths), redWineFileCount)) - def testReadQEProFile(self): - db = RamanDB() - wavelengths, intensities = db.readQEProFile('originaldata/Q100.txt') + def test10ReadQEProFile(self): + wavelengths, intensities = self.db.readQEProFile('originaldata/Q100.txt') self.assertEqual(len(intensities), 1044) - def testInsertAllSpectra(self): - db = RamanDB() + def test11InsertAllSpectra(self): dataDir = 'originaldata' filenames = os.listdir(dataDir) filePaths = [] for filename in filenames: filePaths.append(os.path.join(dataDir, filename)) - inserted = db.insertSpectralDataFromFiles(filePaths) + inserted = self.db.insertSpectralDataFromFiles(filePaths) if inserted == 0: self.skipTest("Nothing was inserted") - def testExecuteCount(self): - db = RamanDB() - self.assertTrue(db.executeCount("select count(*) as count from spectra") > 0) + def test12ExecuteCount(self): + self.assertTrue(self.db.executeCount("select count(*) as count from spectra") > 0) - def testInsertAllCorrectedSpectra(self): - db = RamanDB() - db.execute("select distinct spectrumId from spectra where spectrumId not in (select spectrumId from spectra where dataType='fluorescence-corrected')") - records = db.fetchAll() + def test13InsertAllCorrectedSpectra(self): + self.db.execute("select distinct spectrumId from spectra where dataType='raw' and spectrumId not in (select spectrumId from spectra where dataType='fluorescence-corrected')") + records = self.db.fetchAll() if len(records) == 0: self.skipTest("All corrected spectra exist in the database") for record in records: spectrumId = record["spectrumId"] - spectrum, labels = db.getSpectrum(dataType='raw', spectrumId=spectrumId) + spectrum, labels = self.db.getSpectrum(dataType='raw', spectrumId=spectrumId) degree = 100 - correctedSpectrum = db.subtractFluorescence(spectrum, polynomialDegree=degree) + correctedSpectrum = self.db.subtractFluorescence(spectrum, polynomialDegree=degree) print(spectrumId) match = re.search(r"(\d+)-(\d+)", spectrumId) wineId = int(match.group(1)) sampleId = int(match.group(2)) - db.insertSpectralData(db.wavelengths, correctedSpectrum[:,:], 'fluorescence-corrected', wineId, sampleId, 'BaselineRemoval-nomask-degree{0}'.format(degree)) + self.db.insertSpectralData(self.db.wavelengths, correctedSpectrum[:,:], 'fluorescence-corrected', wineId, sampleId, 'BaselineRemoval-nomask-degree{0}'.format(degree)) @unittest.skip("done") - def testBuildWineIdAndSampleId(self): - db.execute('update files set sampleId=substr(path,18,2) where path like "%\_%" ESCAPE "\"') + def test14BuildWineIdAndSampleId(self): + self.db.execute('update files set sampleId=substr(path,18,2) where path like "%\_%" ESCAPE "\"') - def testWinesSummary(self): - db = RamanDB() - wineSummary = db.getWinesSummary() + def test15WinesSummary(self): + wineSummary = self.db.getWinesSummary() totalNumberOfSpectra = sum([ wine["nSamples"] for wine in wineSummary]) - db.execute("select count(*) as count from spectra where dataType='raw'") - valueRecord = db.fetchOne() - self.assertEqual(valueRecord["count"], totalNumberOfSpectra*len(db.getWavelengths())) + self.db.execute("select count(*) as count from spectra where dataType='raw'") + valueRecord = self.db.fetchOne() + self.assertEqual(valueRecord["count"], totalNumberOfSpectra*len(self.db.getWavelengths())) - def testSingleSpectrum(self): - db = RamanDB() - db.execute("select wavelength, intensity from spectra where spectrumId = '0002-0001'") - records = db.fetchAll() + def test16SingleSpectrum(self): + self.db.execute("select wavelength, intensity from spectra where spectrumId = '0002-0001'") + records = self.db.fetchAll() for record in records: print(record) - def testDataTypes(self): - db = RamanDB() - self.assertTrue('raw' in db.getDataTypes()) + def test17DataTypes(self): + self.assertTrue('raw' in self.db.getDataTypes()) - def testGetSpectraValidTypeFluorescence(self): - db = RamanDB() - if 'fluorescence-corrected' in db.getDataTypes(): - spectra, spectrumIds = db.getSpectraWithId(dataType='fluorescence-corrected') + def test18GetSpectraValidTypeFluorescence(self): + if 'fluorescence-corrected' in self.db.getDataTypes(): + spectra, spectrumIds = self.db.getSpectraWithId(dataType='fluorescence-corrected') self.assertIsNotNone(spectra) else: self.skipTest("No background-corrected spectra in database") - def testGetSpectraInvalidType(self): - db = RamanDB() + def test19GetSpectraInvalidType(self): with self.assertRaises(ValueError): - spectra = db.getSpectraWithId(dataType='unknown') + spectra = self.db.getSpectraWithId(dataType='unknown') + + def test20DatabaseMySQLLocal(self): + db = RamanDB("mysql://127.0.0.1/root@raman") + self.assertIsNotNone(db) + self.assertIsNotNone(db.getWavelengths()) + + def test21Wavenumbers(self): + print(self.db.wavenumbers) + + def test22Mask(self): + print(sum(self.db.wavelengthMask)) + maskRange = [] + for i, mask in enumerate(self.db.wavelengthMask): + if mask: + maskRange.append(i) + print(self.db.wavelengths[maskRange]) if __name__ == "__main__": unittest.main() \ No newline at end of file From 0838ec9afbadbc91895ba11b78c34e569447491b Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 30 Mar 2022 00:11:58 -0400 Subject: [PATCH 19/22] test13 is hanging over ssh. weird bug, avoided by changing the SQL statement --- testDatabase.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/testDatabase.py b/testDatabase.py index 11af9f6..6a45e47 100644 --- a/testDatabase.py +++ b/testDatabase.py @@ -8,8 +8,8 @@ class TestRamanDatabase(unittest.TestCase): def setUp(self): - self.db = RamanDB() - # self.db = RamanDB("mysql://127.0.0.1/root@raman") + # self.db = RamanDB() + self.db = RamanDB("mysql://127.0.0.1/root@raman") self.assertIsNotNone(self.db) @unittest.skip("Now in setUp") @@ -99,7 +99,7 @@ def test12ExecuteCount(self): self.assertTrue(self.db.executeCount("select count(*) as count from spectra") > 0) def test13InsertAllCorrectedSpectra(self): - self.db.execute("select distinct spectrumId from spectra where dataType='raw' and spectrumId not in (select spectrumId from spectra where dataType='fluorescence-corrected')") + self.db.execute("select distinct spectrumId from spectra where spectrumId not in (select spectrumId from spectra where dataType='fluorescence-corrected')") records = self.db.fetchAll() if len(records) == 0: self.skipTest("All corrected spectra exist in the database") @@ -107,6 +107,8 @@ def test13InsertAllCorrectedSpectra(self): for record in records: spectrumId = record["spectrumId"] spectrum, labels = self.db.getSpectrum(dataType='raw', spectrumId=spectrumId) + if spectrum is None: + continue degree = 100 correctedSpectrum = self.db.subtractFluorescence(spectrum, polynomialDegree=degree) print(spectrumId) From 7be19b0da02929bc52b92a417d5221d0c9a0add4 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 30 Mar 2022 08:08:46 -0400 Subject: [PATCH 20/22] Update testDatabase.py --- testDatabase.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/testDatabase.py b/testDatabase.py index 6a45e47..7bd2af3 100644 --- a/testDatabase.py +++ b/testDatabase.py @@ -8,8 +8,8 @@ class TestRamanDatabase(unittest.TestCase): def setUp(self): - # self.db = RamanDB() - self.db = RamanDB("mysql://127.0.0.1/root@raman") + self.db = RamanDB() + # self.db = RamanDB("mysql://127.0.0.1/root@raman") self.assertIsNotNone(self.db) @unittest.skip("Now in setUp") From f8274a25244c10d4c0576cc9824ffa277352494f Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 30 Mar 2022 08:22:28 -0400 Subject: [PATCH 21/22] Removed test for local MySQL database (only on dccote's computer) --- testDatabase.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/testDatabase.py b/testDatabase.py index 7bd2af3..9678632 100644 --- a/testDatabase.py +++ b/testDatabase.py @@ -149,13 +149,14 @@ def test19GetSpectraInvalidType(self): with self.assertRaises(ValueError): spectra = self.db.getSpectraWithId(dataType='unknown') + @unittest.skip("Only on dccote's computer") def test20DatabaseMySQLLocal(self): db = RamanDB("mysql://127.0.0.1/root@raman") self.assertIsNotNone(db) self.assertIsNotNone(db.getWavelengths()) def test21Wavenumbers(self): - print(self.db.wavenumbers) + self.assertIsNotNone(self.db.wavenumbers) def test22Mask(self): print(sum(self.db.wavelengthMask)) From e6540b4117ef5ad0afc7418aae3fc44e5995d7f9 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 30 Mar 2022 08:34:43 -0400 Subject: [PATCH 22/22] BUg fix: rename function --- testVino.py | 2 +- vino.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/testVino.py b/testVino.py index 934185c..6d54842 100644 --- a/testVino.py +++ b/testVino.py @@ -64,7 +64,7 @@ def testColormap(self): vino = vinoPCA() cm = vino.getColorMap() self.assertIsNotNone(cm) - spectra, labels = vino.db.getIntensities() + spectra, labels = vino.db.getSpectraWithId() self.assertEqual(len(cm), len(labels)) # def testOneSpectrum(self): diff --git a/vino.py b/vino.py index 04a3db7..01db57c 100644 --- a/vino.py +++ b/vino.py @@ -18,7 +18,7 @@ def __init__(self): self.wavelengths = self.db.getWavelengths() - self.wavelengthMask = range(200, 1000) + self.wavelengthMask = self.db.wavelengthMask self.data = self.data[self.wavelengthMask, :] self.wavelengths = self.wavelengths[self.wavelengthMask]