diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/PyVino.iml b/.idea/PyVino.iml new file mode 100644 index 0000000..d0876a7 --- /dev/null +++ b/.idea/PyVino.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml new file mode 100644 index 0000000..5b2b08f --- /dev/null +++ b/.idea/dataSources.xml @@ -0,0 +1,17 @@ + + + + + sqlite.xerial + true + org.sqlite.JDBC + jdbc:sqlite:$PROJECT_DIR$/raman.db + $ProjectFileDir$ + + + file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.34.0/sqlite-jdbc-3.34.0.jar + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..1b700c1 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,8 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..d1e22ec --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..d2c2de9 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/ramandb.py b/ramandb.py index 52ed268..3725b18 100644 --- a/ramandb.py +++ b/ramandb.py @@ -1,32 +1,85 @@ +import dcclab from dcclab.database import * import numpy as np import requests +from BaselineRemoval import BaselineRemoval +import re -class RamanDB(Database): - url = 'https://www.dropbox.com/s/peowchyj7xyib4w/raman.db?dl=1' - def __init__(self, writePermission=False): +class RamanDB(dcclab.database.Database): + def __init__(self, databaseURL = None): """ - Creates the database object for Raman spectra. + The Database is a MySQL database on cafeine called `raman`. """ - - self.databasePath = "raman.db" - if not os.path.exists(self.databasePath): - print("The raman.db file is not available. Atttempting to download from {0}".format(self.url)) - filename = self.downloadDatabase() - if os.path.exists(filename) and not os.path.exists(self.databasePath): - os.rename(filename, self.databasePath) - print("Success. File has been renamed raman.db") + if databaseURL is None: + databaseURL = "mysql://dcclab@cafeine2.crulrg.ulaval.ca/dcclab@raman" self._wavelengths = None + self._wavelengthMask = None self.progressStart = None - super().__init__(self.databasePath, writePermission=writePermission) + self.constraints = [] + self.pumpWavelengthInNm = 785 + super().__init__(databaseURL) + + if dcclab.__version__ < "1.0.3": + print("You should update PyDCCLab with `pip install dcclab` to get the latest version.") + + def showHelp(self): + print(""" + All wines obtained from the group are in this database. Things to know: + * Wines are identified with a "wineId" that is A,B,C, .... AA, AB, AC, .... etc. + * Each wine has a number a spectrum acquisitions associated with it (typically 30, 60, etc...) + * When a Raman spectrum is acquired + + """) + + def execute(self, statement, bindings=None): + """ + This function with "bindings" is necessary to handle binary data: it cannot be inserted with a string statement. + The bindings are explained here: https://zetcode.com/db/sqlitepythontutorial/ and are similar to .format() + but are handled properly by the sqlite3 module instead of a python string. Without it, binary data + is inserted as a string, which is not good. + + See insertFileContentIntoSources() for an example. + + """ + if bindings is None: + super().execute(statement) # Call the original function from dcclab.database + else: + self.cursor.execute(statement, bindings) + + def executeCount(self, statement, bindings=None): + """ + This function with "bindings" is necessary to handle binary data: it cannot be inserted with a string statement. + The bindings are explained here: https://zetcode.com/db/sqlitepythontutorial/ and are similar to .format() + but are handled properly by the sqlite3 module instead of a python string. Without it, binary data + is inserted as a string, which is not good. + + See insertFileContentIntoSources() for an example. + + """ + self.execute(statement, bindings) + singleRecord = self.fetchOne() + keys = list(singleRecord.keys()) + if len(keys) == 1: + return int(singleRecord[keys[0]]) + else: + return None - def downloadDatabase(self): - r = requests.get(self.url, allow_redirects=True) - filename = "raman-download.db" - with open(filename, 'wb') as file: - file.write(r.content) - return filename + def parseURL(self, url): + #mysql://sshusername:sshpassword@cafeine2.crulrg.ulaval.ca/mysqlusername:mysqlpassword@questions + if dcclab.__version__ >= "1.0.4": + print("No need to patch parseURL in this dcclab version") + + match = re.search("(mysql)://(.*?)@?([^@]+?)/(.*?)@(.+)", url) + if match is not None: + protocol = Engine.mysql + sshuser = match.group(2) + host = match.group(3) + mysqluser = match.group(4) + database = match.group(5) + return (protocol, sshuser, host, mysqluser, database) + else: + return (Engine.sqlite3, None, "127.0.0.1", None, url) @property def wavelengths(self): @@ -35,8 +88,96 @@ def wavelengths(self): return self._wavelengths + @property + def wavenumbers(self): + return 1e7*(1.0/self.pumpWavelengthInNm - 1.0/self.wavelengths) + + @property + def wavelengthMask(self): + if self._wavelengthMask is None: + self._wavelengthMask = self.getWavelengthMask() + + return self._wavelengthMask + + def getWavelengthMask(self): + self.execute(r"select distinct(wavelength), intensity from spectra where dataType='mask-wine' order by wavelength") + rows = self.fetchAll() + nTotal = len(rows) + + if nTotal != 0: + mask = np.zeros(shape=(nTotal),dtype=bool) + for i,row in enumerate(rows): + mask[i] = bool(row['intensity']) + else: + mask = np.zeros(shape=(len(self.wavelengths))) + for i in range(200, 1000): + mask[i] = True + self.insertSpectralData(wavelengths=self.wavelengths, intensities=mask, dataType='mask-wine', wineId=None, sampleId=None, algorithm='BaselineRemoval') + + return mask + + def readQEProFile(self, filePath): + # text_file = open(filePath, "br") + # hash = hashlib.md5(text_file.read()).hexdigest() + # text_file.close() + + with open(filePath, "r") as text_file: + lines = text_file.read().splitlines() + + wavelengths = [] + intensities = [] + for line in lines: + match = re.match(r'^\s*(\d+\.?\d+)\s+(-?\d*\.?\d*)', line) + if match is not None: + intensity = match.group(2) + wavelength = match.group(1) + wavelengths.append(wavelength) + intensities.append(intensity) + else: + pass + # print("Line does not match: {0}".format(line)) + return wavelengths, intensities + + def insertSpectralDataFromFiles(self, filePaths, dataType='raw'): + inserted = 0 + for filePath in filePaths: + match = re.search(r'([A-Z]{1,2})_?(\d{1,3})\.', filePath) + if match is None: + raise ValueError("The file does not appear to have a valid name: {0}".format(filePath)) + + wineId = int(ord(match.group(1))-ord('A')) + sampleId = int(match.group(2)) + spectrumId = "{0:04}-{1:04d}".format(wineId, sampleId) + + wavelengths, intensities = self.readQEProFile(filePath) + try: + self.insertSpectralData(wavelengths, intensities, dataType, wineId, sampleId) + print("Inserted {0}".format(filePath)) + inserted += 1 + except ValueError as err: + print(err) + + return inserted + + def insertSpectralData(self, wavelengths, intensities, dataType, wineId, sampleId, algorithm=None): + if wineId is None or sampleId is None: + spectrumId = None + else: + spectrumId = "{0:04}-{1:04d}".format(wineId, sampleId) + + count = self.executeCount('select count(*) as count from spectra where spectrumId = "{0}" and dataType = "{1}"'.format(spectrumId, dataType)) + if count != 0 : + raise ValueError("Spectrum {0} already exists with dataType='{1}'".format(spectrumId, dataType)) + + values = [] + for x,y in zip(wavelengths, intensities): + values.append("({0}, {1}, '{2}', '{3}', '{4}', '{5}', now(), '{6}') ".format(x,float(y), dataType, wineId, sampleId, spectrumId, algorithm)) + + bigStatement = "insert into spectra (wavelength, intensity, dataType, wineId, sampleId, spectrumId, dateAdded, algorithm) values" + ','.join(values) + self.execute( bigStatement) + def getWavelengths(self): - self.execute(r"select distinct(wavelength) from spectra order by wavelength") + self.execute(r"select distinct(wavelength) from spectra where dataType='raw' order by wavelength") rows = self.fetchAll() nTotal = len(rows) @@ -46,15 +187,43 @@ def getWavelengths(self): return wavelengths + def getDataTypes(self): + self.execute('select distinct dataType from spectra') + rows = self.fetchAll() + dataTypes = [] + for row in rows: + dataTypes.append(row["dataType"]) + + return dataTypes - def getCountFiles(self): + def getWineIds(self): + self.execute(r"select count(*) as count, wineId as id from files group by wineId order by wineId;") + rows = self.fetchAll() + identifiers = {} + for row in rows: + id = row["id"] + nSamples = row["count"] + identifiers[id] = nSamples + return identifiers + + def getWinesSummary(self): + # mysql.connector.errors.ProgrammingError: 1055( + # 42000): Expression # 4 of SELECT list is not in GROUP BY clause and contains nonaggregated column 'raman.wines.dateOpened' which is not functionally dependent on columns in GROUP BY clause; this is incompatible with sql_mode=only_full_group_by + + self.execute(r"select files.wineId, count(*) as nSamples, wines.* from files inner join wines on wines.wineId = files.wineId group by files.wineId order by files.wineId") + rows = self.fetchAll() + wines = [] + for row in rows: + wines.append(dict(row)) + return wines + + def getFileCount(self): self.execute(r"select count(*) as count from files") rows = self.fetchAll() if rows is None: return 0 return rows[0]["count"] - def getSpectraPaths(self): self.execute("select path from files order by path") rows = self.fetchAll() @@ -63,37 +232,118 @@ def getSpectraPaths(self): paths.append(row['path']) return paths - def getIntensities(self, limit=None): + def getSpectrum(self, dataType, spectrumId): + whereConstraints = [] + possibleDataTypes = self.getDataTypes() + + if dataType is None: + dataType = 'raw' + if dataType not in possibleDataTypes: + raise ValueError('Possible dataTypes are {0}'.format(possibleDataTypes)) + whereConstraints.append("dataType = '{0}'".format(dataType)) + + whereConstraints.append("spectrumId = '{0}'".format(spectrumId)) + + if len(whereConstraints) != 0: + whereClause = "where " + " and ".join(whereConstraints) + else: + whereClause = "" + stmnt = """ - select wavelength, intensity, files.path from spectra - inner join files on files.fid = spectra.fid - order by files.path, wavelength """ + select wavelength, intensity, spectra.spectrumId from spectra + {0} + order by spectra.spectrumId, spectra.wavelength """.format(whereClause ) wavelengths = self.getWavelengths() nWavelengths = len(wavelengths) + self.execute(stmnt) + + rows = [] + row = self.fetchOne() + while row is not None: + rows.append(row) + if len(rows) % 100 == 0: + print(".", end='') + row = self.fetchOne() + + nSamples = len(rows)//nWavelengths + if nSamples == 0: + return None, None + + spectra = np.zeros(shape=(nWavelengths, nSamples)) + spectrumIdentifiers = [""]*nSamples + for i,row in enumerate(rows): + spectra[i%nWavelengths, i//nWavelengths] = float(row['intensity']) + spectrumIdentifiers[i//nWavelengths] = row['spectrumId'] + + return spectra, spectrumIdentifiers + + def getSpectraWithId(self, dataType=None, color=None, limit=None): + whereConstraints = [] + possibleDataTypes = self.getDataTypes() + + if dataType is None: + dataType = 'raw' + if dataType not in possibleDataTypes: + raise ValueError('Possible dataTypes are {0}'.format(possibleDataTypes)) + whereConstraints.append("dataType = '{0}'".format(dataType)) + + if color is not None: + whereConstraints.append(' wineId in (select wineId from wines where color="{0}") '.format(color)) + + if len(whereConstraints) != 0: + whereClause = "where " + " and ".join(whereConstraints) + else: + whereClause = "" + + stmnt = """ + select wavelength, intensity, spectra.spectrumId + from spectra + {0} + order by spectra.spectrumId, spectra.wavelength """.format(whereClause ) + + wavelengths = self.wavelengths + nWavelengths = len(wavelengths) + if limit is not None: stmnt += " limit {0}".format(limit*nWavelengths) self.execute(stmnt) - rows = list(self.fetchAll()) - if rows is None: - return None - + rows = [] + row = self.fetchOne() + while row is not None: + rows.append(row) + if len(rows) % 100 == 0: + print(".", end='') + row = self.fetchOne() + nSamples = len(rows)//nWavelengths if nSamples == 0: return None spectra = np.zeros(shape=(nWavelengths, nSamples)) - wineIdentifiers = [""]*nSamples + spectrumIdentifiers = [""]*nSamples for i,row in enumerate(rows): spectra[i%nWavelengths, i//nWavelengths] = float(row['intensity']) - match = re.search(r"([A-Z]+)_?\d+.txt", row["path"]) - if match is not None: - wineIdentifiers[i//nWavelengths] = match.group(1) + spectrumIdentifiers[i//nWavelengths] = row['spectrumId'] + + return spectra, spectrumIdentifiers + + def subtractFluorescence(self, rawSpectra, polynomialDegree=5): + + """ + Remove fluorescence background from the data. + :return: A corrected data without the background. + """ + + correctedSpectra = np.empty_like(rawSpectra) + for i in range(rawSpectra.shape[1]): + spectrum = rawSpectra[:, i] + correctedSpectra[:, i] = BaselineRemoval(spectrum).IModPoly(polynomialDegree) - return spectra, wineIdentifiers + return correctedSpectra def showProgressBar(self, iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"): """ diff --git a/testDatabase.py b/testDatabase.py index 2d2bdca..9678632 100644 --- a/testDatabase.py +++ b/testDatabase.py @@ -4,67 +4,167 @@ import os from ramandb import RamanDB import requests +import re -class TestBuildDatabase(unittest.TestCase): - def testDatabase(self): - db = RamanDB() - self.assertIsNotNone(db) - self.assertTrue(os.path.exists(db.databasePath)) +class TestRamanDatabase(unittest.TestCase): + def setUp(self): + self.db = RamanDB() + # self.db = RamanDB("mysql://127.0.0.1/root@raman") + self.assertIsNotNone(self.db) - def testWavelengths(self): - db = RamanDB() - self.assertIsNotNone(db.getWavelengths()) - self.assertEqual(len(db.getWavelengths()), 1044) - - def testWavelengthsProperty(self): - db = RamanDB() - self.assertIsNotNone(db.wavelengths) - self.assertEqual(len(db.wavelengths), 1044) - - def testFileCount(self): - db = RamanDB() - self.assertIsNotNone(db.getCountFiles()) - self.assertEqual(db.getCountFiles(), 709) - - def testFilePaths(self): - db = RamanDB() - self.assertIsNotNone(db.getSpectraPaths()) - self.assertEqual(db.getCountFiles(), len(db.getSpectraPaths())) - - def testGetIntensity(self): - db = RamanDB() - matrix, labels = db.getIntensities() + @unittest.skip("Now in setUp") + def test01Database(self): + self.db = RamanDB() + self.assertIsNotNone(self.db) + + def test02Wavelengths(self): + self.assertIsNotNone(self.db.getWavelengths()) + + def test03WavelengthsAreUniqueAndCommon(self): + """ + Check that all RAW spectra have the same number of wavelengths. + This is a complex SQL statement with a sub-select, but it returns 1 if true and 0 if false. + """ + self.db.execute(""" + SELECT + MAX(spectralPts) = MIN(spectralPts) as wavelengthsAreAllTheSame + FROM + (SELECT + COUNT(wavelength) AS spectralPts + FROM + spectra + where dataType='raw' + GROUP BY wavelength) AS something; + """) + firstRecord = self.db.fetchOne() + self.assertEqual(firstRecord["wavelengthsAreAllTheSame"], 1) + + def test04WavelengthsProperty(self): + self.assertIsNotNone(self.db.wavelengths) + + def test05FileCount(self): + self.assertIsNotNone(self.db.getFileCount()) + + def test06FileCountShouldMatchRawSpectraTimesWavelength(self): + """ + NUmber of points in the spectra database for 'raw' spectra should be #wavelengths x #files + """ + rawSpectraCount = self.db.getFileCount() + wavelengthsCount = len(self.db.getWavelengths()) + + self.db.execute("select count(*) as count from spectra where dataType='raw'") + valueRecord = self.db.fetchOne() + self.assertEqual(valueRecord["count"], rawSpectraCount*wavelengthsCount) + + def test07FilePaths(self): + self.assertIsNotNone(self.db.getSpectraPaths()) + self.assertEqual(self.db.getFileCount(), len(self.db.getSpectraPaths())) + + def test08GetWhiteSpectra(self): + self.db.execute("select count(*) as count from files inner join wines on wines.wineId = files.wineId where wines.color = 'white'") + firstRecord = self.db.fetchOne() + whiteWineFileCount = firstRecord["count"] + + matrix, labels = self.db.getSpectraWithId(dataType='raw', color='white') + self.assertIsNotNone(matrix) + + self.assertEqual(matrix.shape, (len(self.db.wavelengths), whiteWineFileCount)) + + def test09GetRedSpectra(self): + self.db.execute("select count(*) as count from files inner join wines on wines.wineId = files.wineId where wines.color = 'red'") + firstRecord = self.db.fetchOne() + redWineFileCount = firstRecord["count"] + + matrix, labels = self.db.getSpectraWithId(dataType='raw', color='red') self.assertIsNotNone(matrix) - self.assertEqual(matrix.shape, (len(db.wavelengths), db.getCountFiles())) - - @unittest.skip("Ok, tested") - def testDownload(self): - url = 'https://www.dropbox.com/s/2st0sv7jpii6dz8/raman.db?dl=1' - r = requests.get(url, allow_redirects=True) - with open('test.db', 'wb') as file: - file.write(r.content) - - @unittest.skip("Ok, tested") - def testDownload(self): - db = RamanDB() - filename = db.downloadDatabase() - self.assertTrue(os.path.exists(filename)) - os.remove(filename) - - @unittest.skip("Done, no need to redo.") - def testAddFileIdToDatabase(self): - db = RamanDB(writePermission=True) - db.execute("select * from files order by path") - records = db.fetchAll() - for i, record in enumerate(records): - db.execute("update files set fid={0} where md5='{1}'".format(i, record["md5"])) - - db.execute("select spectra.md5, files.fid from spectra inner join files on files.md5 = spectra.md5") - records = db.fetchAll() - for i, record in enumerate(records): - statement = "update spectra set fid={0} where md5='{1}'".format(record["fid"], record["md5"]) - db.execute(statement) + self.assertEqual(matrix.shape, (len(self.db.wavelengths), redWineFileCount)) + + def test10ReadQEProFile(self): + wavelengths, intensities = self.db.readQEProFile('originaldata/Q100.txt') + self.assertEqual(len(intensities), 1044) + + def test11InsertAllSpectra(self): + dataDir = 'originaldata' + filenames = os.listdir(dataDir) + filePaths = [] + for filename in filenames: + filePaths.append(os.path.join(dataDir, filename)) + + inserted = self.db.insertSpectralDataFromFiles(filePaths) + if inserted == 0: + self.skipTest("Nothing was inserted") + + def test12ExecuteCount(self): + self.assertTrue(self.db.executeCount("select count(*) as count from spectra") > 0) + + def test13InsertAllCorrectedSpectra(self): + self.db.execute("select distinct spectrumId from spectra where spectrumId not in (select spectrumId from spectra where dataType='fluorescence-corrected')") + records = self.db.fetchAll() + if len(records) == 0: + self.skipTest("All corrected spectra exist in the database") + + for record in records: + spectrumId = record["spectrumId"] + spectrum, labels = self.db.getSpectrum(dataType='raw', spectrumId=spectrumId) + if spectrum is None: + continue + degree = 100 + correctedSpectrum = self.db.subtractFluorescence(spectrum, polynomialDegree=degree) + print(spectrumId) + match = re.search(r"(\d+)-(\d+)", spectrumId) + wineId = int(match.group(1)) + sampleId = int(match.group(2)) + self.db.insertSpectralData(self.db.wavelengths, correctedSpectrum[:,:], 'fluorescence-corrected', wineId, sampleId, 'BaselineRemoval-nomask-degree{0}'.format(degree)) + + @unittest.skip("done") + def test14BuildWineIdAndSampleId(self): + self.db.execute('update files set sampleId=substr(path,18,2) where path like "%\_%" ESCAPE "\"') + + def test15WinesSummary(self): + wineSummary = self.db.getWinesSummary() + totalNumberOfSpectra = sum([ wine["nSamples"] for wine in wineSummary]) + + self.db.execute("select count(*) as count from spectra where dataType='raw'") + valueRecord = self.db.fetchOne() + self.assertEqual(valueRecord["count"], totalNumberOfSpectra*len(self.db.getWavelengths())) + + def test16SingleSpectrum(self): + self.db.execute("select wavelength, intensity from spectra where spectrumId = '0002-0001'") + records = self.db.fetchAll() + for record in records: + print(record) + + def test17DataTypes(self): + self.assertTrue('raw' in self.db.getDataTypes()) + + def test18GetSpectraValidTypeFluorescence(self): + if 'fluorescence-corrected' in self.db.getDataTypes(): + spectra, spectrumIds = self.db.getSpectraWithId(dataType='fluorescence-corrected') + self.assertIsNotNone(spectra) + else: + self.skipTest("No background-corrected spectra in database") + + def test19GetSpectraInvalidType(self): + with self.assertRaises(ValueError): + spectra = self.db.getSpectraWithId(dataType='unknown') + + @unittest.skip("Only on dccote's computer") + def test20DatabaseMySQLLocal(self): + db = RamanDB("mysql://127.0.0.1/root@raman") + self.assertIsNotNone(db) + self.assertIsNotNone(db.getWavelengths()) + + def test21Wavenumbers(self): + self.assertIsNotNone(self.db.wavenumbers) + + def test22Mask(self): + print(sum(self.db.wavelengthMask)) + maskRange = [] + for i, mask in enumerate(self.db.wavelengthMask): + if mask: + maskRange.append(i) + print(self.db.wavelengths[maskRange]) if __name__ == "__main__": unittest.main() \ No newline at end of file diff --git a/testVino.py b/testVino.py new file mode 100644 index 0000000..6d54842 --- /dev/null +++ b/testVino.py @@ -0,0 +1,81 @@ +import unittest +import numpy as np +from dcclab import Database +import os +from ramandb import RamanDB +import requests +import matplotlib.pyplot as plt +from vino import vinoPCA + +class TestVInoClass(unittest.TestCase): + @unittest.skip("NOt now") + def testInit(self): + iterable = [31, 30, 30, 30, 80, 31, 33, 31, 30, 30, 30, 30, 30, 30, 30, 30, 104, 30, 30] # sans vin blanc parceque ça shit le aspect ratio + total = sum(iterable) + + # Data = np.genfromtxt('/Users/Shooshoo/PycharmProjects/PCA_DCCLab/DataVino_Sorted.csv', delimiter=',') + db = RamanDB() + data, labels = db.getIntensities() + wavelengths = db.getWavelengths() + data = np.cat(wavelengths, wavelengths, data[:,0:total]) + self.assertEqual(data.shape[1], total) + my_Spectrums = vinoPCA(data, iterable) + + self.assertIsNotNone(my_Spectrums) + + def testRemoveFluo(self): + iterable = [31, 30, 30, 30, 80, 31, 33, 31, 30, 30, 30, 30, 30, 30, 30, 30, 104, 30, 30] # sans vin blanc parceque ça shit le aspect ratio + total = sum(iterable) + + # I need to remove this function, I don't have access to the csv file. + # Data = np.genfromtxt('/Users/Shooshoo/PycharmProjects/PCA_DCCLab/DataVino_Sorted.csv', delimiter=',') + # After a bit of playing around: column 0 is not used, column 1 is the wavelengths, then its + # the data + my_Spectrums = vinoPCA() + + self.assertIsNotNone(my_Spectrums) + + my_Spectrums.subtractFluorescence() + + + def testDoPCA(self): + my_Spectrums = vinoPCA() + + self.assertIsNotNone(my_Spectrums) + + my_Spectrums.doPCA(10) + my_Spectrums.showTransformedData3D() + my_Spectrums.showTransformedData2D() + my_Spectrums.showEigenvectors() + + def testvinoPCANoArgument(self): + my_Spectrums = vinoPCA() + self.assertIsNotNone(my_Spectrums) + + my_Spectrums.doPCA(3) + my_Spectrums.showTransformedData3D() + my_Spectrums.showTransformedData2D() + my_Spectrums.showEigenvectors() + + # def testInitDB(self): + # self.assertIsNotNone(vinoPCA().db) + + def testColormap(self): + vino = vinoPCA() + cm = vino.getColorMap() + self.assertIsNotNone(cm) + spectra, labels = vino.db.getSpectraWithId() + self.assertEqual(len(cm), len(labels)) + + # def testOneSpectrum(self): + # vino = vinoPCA() + # spectra, labels = vino.db.getIntensities() + # plt.plot(spectra[:,1]) + # newSpectra = vino.removeFLuo(spectra) + # print(newSpectra) + # # plt.plot(newSpectra) + # # plt.show() + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/vino.py b/vino.py index 2f40e0f..01db57c 100644 --- a/vino.py +++ b/vino.py @@ -4,19 +4,23 @@ from sklearn.decomposition import PCA from scipy import interpolate from BaselineRemoval import BaselineRemoval - +from ramandb import RamanDB class vinoPCA: - def __init__(self, Data, numberOfEachSamples): + def __init__(self): + self.db = RamanDB() + self.constraints = [] + self.data, self.labels = self.db.getSpectraWithId(dataType='raw') + self.correctedData, correctedLabel = self.db.getSpectraWithId(dataType='fluorescence-corrected') + if self.labels != correctedLabel: + raise ValueError('Not all spectra are corrected') - """ - :param Data: The data on wich PCA should be done. - :param colormap: An iterable that contains how many of each samples there is in Data, in the good order. - """ + self.wavelengths = self.db.getWavelengths() - self.Data = Data - self.numberOfEachSamples = numberOfEachSamples + self.wavelengthMask = self.db.wavelengthMask + self.data = self.data[self.wavelengthMask, :] + self.wavelengths = self.wavelengths[self.wavelengthMask] def getColorMap(self): @@ -25,50 +29,32 @@ def getColorMap(self): :return: Return a colormap to visualise different samples on the plot. """ - for i in range(0, len(self.numberOfEachSamples)): - if i == 0: - colormap = np.zeros(self.numberOfEachSamples[0]) - else: - colormap = np.append(colormap, np.ones(self.numberOfEachSamples[i]) *5*i) + uniqueLabelsInOrder = sorted(set(self.labels)) + possibleColorsInOrder = range(len(uniqueLabelsInOrder)) + colors = {} + for identifier, color in zip(uniqueLabelsInOrder, possibleColorsInOrder): + colors[identifier] = color*5 + + colormap = [] + for identifier in self.labels: + colormap.append(colors[identifier]) - return colormap + return np.array(colormap) - def removeFLuo(self, Data): + def subtractFluorescence(self): """ - Remove fluorescence background from the data given. - :param Data: The Data from witch you wish to remove fluo background. - :return: A new set of Data without the background. + Remove fluorescence background from the data. + :return: A corrected data without the background. """ - nm = Data[:, 1] - cm = 1 / (632.8e-9) - 1 / (nm * 1e-9) - size = np.ma.size(Data, 1) - polynomial_degree = 100 - filtered_datas = np.zeros(shape=(800, size - 1)) + polynomial_degree = 5 + correctedSpectra = np.empty_like(self.data) + for i in range(self.data.shape[1]): + spectre = self.data[:, i] + correctedSpectra[:, i] = BaselineRemoval(spectre).IModPoly(polynomial_degree) - # for column in range(2, size): - # y = Data[:, column] - # d = 25 - # f2 = interpolate.interp1d(cm[199:][::d], y[199:][::d], kind='quadratic') - # y = y[200:1000] - f2(cm[200:1000]) - # y = (y - min(y)) / max(y - min(y)) - # filt_datas[:, column - 1] = y - # filt_datas[:, 0] = cm[200:1000] - - for column in range(2, size): - spectre = Data[200:1000, column] - baseObj = BaselineRemoval(spectre) - values = baseObj.IModPoly(polynomial_degree) - # values = values - min(values) # Si tu normalises, tu perds les composants communs (Alcool particulèrement) - # values = values/max(values) # tu perds aussi le degrés de présence (Plus ou moins bouchonné ?) - # Si tu normalises pas, tu favorises les composants communs présents à - # différents degrés (Plus ou moins d'alcool). Donc tester avec et sans? - filtered_datas[:, column - 1] = values - - filtered_datas[:, 0] = Data[200:1000, 1] - - return filtered_datas + return correctedSpectra def doPCA(self, n:int): @@ -77,11 +63,9 @@ def doPCA(self, n:int): :param n: number of componants to get from the PCA :return: Returns nothing. Just creats an array of the transformed datas into the new vector space """ - - new_Datas = self.removeFLuo(self.Data) - new_Datas = np.transpose(new_Datas) - self.X_PCA = PCA(n_components=n) - self.X_reduced = self.X_PCA.fit_transform(new_Datas[1:, :]) + self.pca = PCA(n_components=n) + correctedData = self.subtractFluorescence() + self.X_reduced = self.pca.fit_transform(correctedData.T) def showTransformedData3D(self): @@ -94,9 +78,9 @@ def showTransformedData3D(self): fig = plt.figure(1, figsize=(8, 6)) ax = Axes3D(fig, elev=-150, azim=110) ax.scatter( - self.X_reduced[:700, 0], - self.X_reduced[:700, 1], - self.X_reduced[:700, 2], + self.X_reduced[:, 0], + self.X_reduced[:, 1], + self.X_reduced[:, 2], c=self.getColorMap(), cmap='nipy_spectral', s=10) @@ -118,7 +102,7 @@ def showTransformedData2D(self): plt.clf() plt.figure(2) - plt.scatter(self.X_reduced[:700, 0], self.X_reduced[:700, 1], c=self.getColorMap(), cmap='nipy_spectral', s=10) + plt.scatter(self.X_reduced[:, 0], self.X_reduced[:, 1], c=self.getColorMap(), cmap='nipy_spectral', s=10) plt.title('First two PCA directions') plt.xlabel('1st eigenvector') plt.ylabel('2nd eigenvector') @@ -138,7 +122,7 @@ def getAllEigenvectors(self): :return: an array of n eigenvector """ - return self.X_PCA.components_.transpose() + return self.pca.components_.transpose() def showEigenvectors(self): @@ -148,13 +132,13 @@ def showEigenvectors(self): """ plt.figure(3) plt.title('1st eigenvector') - plt.plot(self.X_PCA.components_.transpose()[:, 0]) + plt.plot(self.pca.components_.transpose()[:, 0]) plt.figure(4) plt.title('2nd eigenvector') - plt.plot(self.X_PCA.components_.transpose()[:, 1]) + plt.plot(self.pca.components_.transpose()[:, 1]) plt.figure(5) plt.title('3rd eigenvector') - plt.plot(self.X_PCA.components_.transpose()[:, 2]) + plt.plot(self.pca.components_.transpose()[:, 2]) plt.show() def getTransformedDatas(self): @@ -173,7 +157,7 @@ def getScreeValues(self): :return: array of the scree values, from most important to least """ - return self.X_PCA.explained_variance_ratio_ + return self.pca.explained_variance_ratio_ def plotScreeValues(self): diff --git a/wines.txt b/wines.txt new file mode 100644 index 0000000..3b19938 --- /dev/null +++ b/wines.txt @@ -0,0 +1,27 @@ +A 2022/01/12 Wine Sirius Bordeaux 2018 https://www.saq.com/en/223537 VPN France Merlot, Cabernet Sauvignon 2.2 red 13 +B 2022/01/12 Wine Ménage à Trois 2019 https://www.saq.com/en/10709152 VPN United States Cabernet Sauvignon 4.3 red 13.5 +C 2022/01/22 Wine Woodbridge by Robert Mondavi https://www.saq.com/en/48611 VPN United States Cabernet Sauvignon 7.3 red 13.5 +D 2022/01/28 Wine Les Jamelles Pinot Noir Pays d'Oc https://www.saq.com/en/10802904 VPN France point noir 4 red 13 +E 2022/01/27 Wine Monasterio de las Vinas https://www.saq.com/en/854422 VPN Spain 70% Garnacha, 20% Tempranillo, 10% Carinena 2.1 red 13.5 +F 2022/02/05 Wine Revolution https://www.saq.com/en/12166892 EP United States Ruby cabernet 50 %, Carignan 32 %, Syrah 18 % 10 red 13.5 +G 2022/02/12 Wine Milhistoraise https://www.saq.com/en/13794111 EP Spain Grenache 1.7 red 14 +H 2022/02/13 Wine Wallaroo Trail Shiraz https://www.saq.com/en/12498459 EP Australia Shiraz 85 %, Cabernet sauvignon 10 %, Petit verdot 5 % 11 red 13.5 +I 2022/02/13 Wine Toro loco https://futailles.com/en/products/wine/red/toro-loco EP Spain Tempranillo 0 red 12.5 +J 2022/02/13 Wine Cantini https://vinstriani.com/produits/cantini-rouge.html EP Italy Sangiovese, Montepulciano, and Cabernet Sauvignon - red 12 +K 2022/02/13 Wine Nicolas laloux https://www.vinsenepicerie.com/en/nicolas-laloux-1/ EP Ontario.Canada Cabernet Sauvignon - red 12.5 +L 2022/02/13 Wine smoky bay SHIRAZ https://www.lcbo.com/webapp/wcs/stores/servlet/en/lcbo/red-wine-14001/smoky-bay-shiraz-17650#.YguvavXMIUo EP Australia Shiraz 10 red 13 +M 2022/02/13 Wine Dolce Venti https://futailles.com/en/products/wine/red/dolce-venti EP Italy Merlot - red 11.5 +N 2022/02/13 Wine Aroma mi Amore https://vinsarista.com/en/produit/wines/aroma-mi-amore/aroma-mi-amore-red-wine/ EP Italy Refosco - red 14.5 +O 2022/02/19 Wine Sonho Aragonez https://www.vivino.com/CA/en/sonho-aragonez/w/5905886 EP Portugal Aragonez red 12.5 +P 2022/02/27 Wine Double vie https://vinsarista.com/en/produit/wines/double-vie/red-wine/ EP Canada red 12 +Q 2022/02/28 Wine Danza https://www.iga.net/en/product/wineargentinian-red-bonarda/00000_000000082424300222 EP Argentina Douce noir red 13.7 +R 2022/02/23 Wine bu https://www.iga.net/en/product/winered-rosso-terre-sicilaine-bio-it/00000_000000005604913702 EP Italy Nero d'Avola 70% + Merlot 20% + Syrah 10% red 12.5 +S 2022/02/24 Wine Croix d'Or https://futailles.com/en/products/wine/red/croix-dor EP Moldavie pinot noir red 12.5 +T 2022/02/18 Wine AUFKELLEREIEN https://www.iga.net/fr/produit/vin-blancallemagne---fruite-et-doux-9--alcool---18-ans--/00000_000000005604980687 AR Allemagne white 9 +U 2022/02/15 Wine Macon Lugny les Cray https://www.saq.com/en/13319061 DC France Bourgogne white +V 2022/02/16 Wine Brumont Cotes de Gascogne https://www.saq.com/en/548883 DC France Sauvignon, Gros Manseng white 12 +W 2022/02/18 Wine Piuze https://www.saq.com/en/14853741 DC France Chardonnay white 12 +X 2022/02/19 Wine Chateau de Maligny https://www.saq.com/en/560763 DC France Chablis Chardonnay white 12.5 +Y 2022/02/21 Wine L'impromptu https://www.saq.com/en/13343264 DC France Gamay red 14 +Z 2022/02/22 Wine Sancerres Aurore Dezat https://www.saq.com/en/13992897 DC France Sancerre Chardonnay 1.6 white 12.5 +AA 2022/02/26 Wine Lord de la Ragotiere https://www.saq.com/en/10690501 DC France Chardonnay white 12 \ No newline at end of file