diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/PyVino.iml b/.idea/PyVino.iml
new file mode 100644
index 0000000..d0876a7
--- /dev/null
+++ b/.idea/PyVino.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml
new file mode 100644
index 0000000..5b2b08f
--- /dev/null
+++ b/.idea/dataSources.xml
@@ -0,0 +1,17 @@
+
+
+
+
+ sqlite.xerial
+ true
+ org.sqlite.JDBC
+ jdbc:sqlite:$PROJECT_DIR$/raman.db
+ $ProjectFileDir$
+
+
+ file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.34.0/sqlite-jdbc-3.34.0.jar
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..1b700c1
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..d1e22ec
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..d2c2de9
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/ramandb.py b/ramandb.py
index 52ed268..3725b18 100644
--- a/ramandb.py
+++ b/ramandb.py
@@ -1,32 +1,85 @@
+import dcclab
from dcclab.database import *
import numpy as np
import requests
+from BaselineRemoval import BaselineRemoval
+import re
-class RamanDB(Database):
- url = 'https://www.dropbox.com/s/peowchyj7xyib4w/raman.db?dl=1'
- def __init__(self, writePermission=False):
+class RamanDB(dcclab.database.Database):
+ def __init__(self, databaseURL = None):
"""
- Creates the database object for Raman spectra.
+ The Database is a MySQL database on cafeine called `raman`.
"""
-
- self.databasePath = "raman.db"
- if not os.path.exists(self.databasePath):
- print("The raman.db file is not available. Atttempting to download from {0}".format(self.url))
- filename = self.downloadDatabase()
- if os.path.exists(filename) and not os.path.exists(self.databasePath):
- os.rename(filename, self.databasePath)
- print("Success. File has been renamed raman.db")
+ if databaseURL is None:
+ databaseURL = "mysql://dcclab@cafeine2.crulrg.ulaval.ca/dcclab@raman"
self._wavelengths = None
+ self._wavelengthMask = None
self.progressStart = None
- super().__init__(self.databasePath, writePermission=writePermission)
+ self.constraints = []
+ self.pumpWavelengthInNm = 785
+ super().__init__(databaseURL)
+
+ if dcclab.__version__ < "1.0.3":
+ print("You should update PyDCCLab with `pip install dcclab` to get the latest version.")
+
+ def showHelp(self):
+ print("""
+ All wines obtained from the group are in this database. Things to know:
+ * Wines are identified with a "wineId" that is A,B,C, .... AA, AB, AC, .... etc.
+ * Each wine has a number a spectrum acquisitions associated with it (typically 30, 60, etc...)
+ * When a Raman spectrum is acquired
+
+ """)
+
+ def execute(self, statement, bindings=None):
+ """
+ This function with "bindings" is necessary to handle binary data: it cannot be inserted with a string statement.
+ The bindings are explained here: https://zetcode.com/db/sqlitepythontutorial/ and are similar to .format()
+ but are handled properly by the database cursor (self.cursor.execute) instead of a python string. Without it, binary data
+ is inserted as a string, which is not good.
+
+ See insertFileContentIntoSources() for an example.
+
+ """
+ if bindings is None:
+ super().execute(statement) # Call the original function from dcclab.database
+ else:
+ self.cursor.execute(statement, bindings)
+
+ def executeCount(self, statement, bindings=None):
+ """
+ Execute `statement` (with optional `bindings`, passed through to execute()) and return the value of
+ the single column of the first record as an integer. This is meant for statements such as
+ `select count(*) as count from ...` that return exactly one value.
+ If the first record has more than one column, None is returned instead.
+
+ See test12ExecuteCount() in testDatabase.py for an example.
+
+ """
+ self.execute(statement, bindings)
+ singleRecord = self.fetchOne()
+ keys = list(singleRecord.keys())
+ if len(keys) == 1:
+ return int(singleRecord[keys[0]])
+ else:
+ return None
- def downloadDatabase(self):
- r = requests.get(self.url, allow_redirects=True)
- filename = "raman-download.db"
- with open(filename, 'wb') as file:
- file.write(r.content)
- return filename
+ def parseURL(self, url):
+ #mysql://sshusername:sshpassword@cafeine2.crulrg.ulaval.ca/mysqlusername:mysqlpassword@questions
+ if dcclab.__version__ >= "1.0.4":
+ print("No need to patch parseURL in this dcclab version")
+
+ match = re.search("(mysql)://(.*?)@?([^@]+?)/(.*?)@(.+)", url)
+ if match is not None:
+ protocol = Engine.mysql
+ sshuser = match.group(2)
+ host = match.group(3)
+ mysqluser = match.group(4)
+ database = match.group(5)
+ return (protocol, sshuser, host, mysqluser, database)
+ else:
+ return (Engine.sqlite3, None, "127.0.0.1", None, url)
@property
def wavelengths(self):
@@ -35,8 +88,96 @@ def wavelengths(self):
return self._wavelengths
+ @property
+ def wavenumbers(self):
+ return 1e7*(1.0/self.pumpWavelengthInNm - 1.0/self.wavelengths)
+
+ @property
+ def wavelengthMask(self):
+ if self._wavelengthMask is None:
+ self._wavelengthMask = self.getWavelengthMask()
+
+ return self._wavelengthMask
+
+ def getWavelengthMask(self):
+ self.execute(r"select distinct(wavelength), intensity from spectra where dataType='mask-wine' order by wavelength")
+ rows = self.fetchAll()
+ nTotal = len(rows)
+
+ if nTotal != 0:
+ mask = np.zeros(shape=(nTotal),dtype=bool)
+ for i,row in enumerate(rows):
+ mask[i] = bool(row['intensity'])
+ else:
+ mask = np.zeros(shape=(len(self.wavelengths)))
+ for i in range(200, 1000):
+ mask[i] = True
+ self.insertSpectralData(wavelengths=self.wavelengths, intensities=mask, dataType='mask-wine', wineId=None, sampleId=None, algorithm='BaselineRemoval')
+
+ return mask
+
+ def readQEProFile(self, filePath):
+ # text_file = open(filePath, "br")
+ # hash = hashlib.md5(text_file.read()).hexdigest()
+ # text_file.close()
+
+ with open(filePath, "r") as text_file:
+ lines = text_file.read().splitlines()
+
+ wavelengths = []
+ intensities = []
+ for line in lines:
+ match = re.match(r'^\s*(\d+\.?\d+)\s+(-?\d*\.?\d*)', line)
+ if match is not None:
+ intensity = match.group(2)
+ wavelength = match.group(1)
+ wavelengths.append(wavelength)
+ intensities.append(intensity)
+ else:
+ pass
+ # print("Line does not match: {0}".format(line))
+ return wavelengths, intensities
+
+ def insertSpectralDataFromFiles(self, filePaths, dataType='raw'):
+ inserted = 0
+ for filePath in filePaths:
+ match = re.search(r'([A-Z]{1,2})_?(\d{1,3})\.', filePath)
+ if match is None:
+ raise ValueError("The file does not appear to have a valid name: {0}".format(filePath))
+
+ wineId = int(ord(match.group(1))-ord('A'))
+ sampleId = int(match.group(2))
+ spectrumId = "{0:04}-{1:04d}".format(wineId, sampleId)
+
+ wavelengths, intensities = self.readQEProFile(filePath)
+ try:
+ self.insertSpectralData(wavelengths, intensities, dataType, wineId, sampleId)
+ print("Inserted {0}".format(filePath))
+ inserted += 1
+ except ValueError as err:
+ print(err)
+
+ return inserted
+
+ def insertSpectralData(self, wavelengths, intensities, dataType, wineId, sampleId, algorithm=None):
+ if wineId is None or sampleId is None:
+ spectrumId = None
+ else:
+ spectrumId = "{0:04}-{1:04d}".format(wineId, sampleId)
+
+ count = self.executeCount('select count(*) as count from spectra where spectrumId = "{0}" and dataType = "{1}"'.format(spectrumId, dataType))
+ if count != 0 :
+ raise ValueError("Spectrum {0} already exists with dataType='{1}'".format(spectrumId, dataType))
+
+ values = []
+ for x,y in zip(wavelengths, intensities):
+ values.append("({0}, {1}, '{2}', '{3}', '{4}', '{5}', now(), '{6}') ".format(x,float(y), dataType, wineId, sampleId, spectrumId, algorithm))
+
+ bigStatement = "insert into spectra (wavelength, intensity, dataType, wineId, sampleId, spectrumId, dateAdded, algorithm) values" + ','.join(values)
+ self.execute( bigStatement)
+
def getWavelengths(self):
- self.execute(r"select distinct(wavelength) from spectra order by wavelength")
+ self.execute(r"select distinct(wavelength) from spectra where dataType='raw' order by wavelength")
rows = self.fetchAll()
nTotal = len(rows)
@@ -46,15 +187,43 @@ def getWavelengths(self):
return wavelengths
+ def getDataTypes(self):
+ self.execute('select distinct dataType from spectra')
+ rows = self.fetchAll()
+ dataTypes = []
+ for row in rows:
+ dataTypes.append(row["dataType"])
+
+ return dataTypes
- def getCountFiles(self):
+ def getWineIds(self):
+ self.execute(r"select count(*) as count, wineId as id from files group by wineId order by wineId;")
+ rows = self.fetchAll()
+ identifiers = {}
+ for row in rows:
+ id = row["id"]
+ nSamples = row["count"]
+ identifiers[id] = nSamples
+ return identifiers
+
+ def getWinesSummary(self):
+ # mysql.connector.errors.ProgrammingError: 1055(
+ # 42000): Expression # 4 of SELECT list is not in GROUP BY clause and contains nonaggregated column 'raman.wines.dateOpened' which is not functionally dependent on columns in GROUP BY clause; this is incompatible with sql_mode=only_full_group_by
+
+ self.execute(r"select files.wineId, count(*) as nSamples, wines.* from files inner join wines on wines.wineId = files.wineId group by files.wineId order by files.wineId")
+ rows = self.fetchAll()
+ wines = []
+ for row in rows:
+ wines.append(dict(row))
+ return wines
+
+ def getFileCount(self):
self.execute(r"select count(*) as count from files")
rows = self.fetchAll()
if rows is None:
return 0
return rows[0]["count"]
-
def getSpectraPaths(self):
self.execute("select path from files order by path")
rows = self.fetchAll()
@@ -63,37 +232,118 @@ def getSpectraPaths(self):
paths.append(row['path'])
return paths
- def getIntensities(self, limit=None):
+ def getSpectrum(self, dataType, spectrumId):
+ whereConstraints = []
+ possibleDataTypes = self.getDataTypes()
+
+ if dataType is None:
+ dataType = 'raw'
+ if dataType not in possibleDataTypes:
+ raise ValueError('Possible dataTypes are {0}'.format(possibleDataTypes))
+ whereConstraints.append("dataType = '{0}'".format(dataType))
+
+ whereConstraints.append("spectrumId = '{0}'".format(spectrumId))
+
+ if len(whereConstraints) != 0:
+ whereClause = "where " + " and ".join(whereConstraints)
+ else:
+ whereClause = ""
+
stmnt = """
- select wavelength, intensity, files.path from spectra
- inner join files on files.fid = spectra.fid
- order by files.path, wavelength """
+ select wavelength, intensity, spectra.spectrumId from spectra
+ {0}
+ order by spectra.spectrumId, spectra.wavelength """.format(whereClause )
wavelengths = self.getWavelengths()
nWavelengths = len(wavelengths)
+ self.execute(stmnt)
+
+ rows = []
+ row = self.fetchOne()
+ while row is not None:
+ rows.append(row)
+ if len(rows) % 100 == 0:
+ print(".", end='')
+ row = self.fetchOne()
+
+ nSamples = len(rows)//nWavelengths
+ if nSamples == 0:
+ return None, None
+
+ spectra = np.zeros(shape=(nWavelengths, nSamples))
+ spectrumIdentifiers = [""]*nSamples
+ for i,row in enumerate(rows):
+ spectra[i%nWavelengths, i//nWavelengths] = float(row['intensity'])
+ spectrumIdentifiers[i//nWavelengths] = row['spectrumId']
+
+ return spectra, spectrumIdentifiers
+
+ def getSpectraWithId(self, dataType=None, color=None, limit=None):
+ whereConstraints = []
+ possibleDataTypes = self.getDataTypes()
+
+ if dataType is None:
+ dataType = 'raw'
+ if dataType not in possibleDataTypes:
+ raise ValueError('Possible dataTypes are {0}'.format(possibleDataTypes))
+ whereConstraints.append("dataType = '{0}'".format(dataType))
+
+ if color is not None:
+ whereConstraints.append(' wineId in (select wineId from wines where color="{0}") '.format(color))
+
+ if len(whereConstraints) != 0:
+ whereClause = "where " + " and ".join(whereConstraints)
+ else:
+ whereClause = ""
+
+ stmnt = """
+ select wavelength, intensity, spectra.spectrumId
+ from spectra
+ {0}
+ order by spectra.spectrumId, spectra.wavelength """.format(whereClause )
+
+ wavelengths = self.wavelengths
+ nWavelengths = len(wavelengths)
+
if limit is not None:
stmnt += " limit {0}".format(limit*nWavelengths)
self.execute(stmnt)
- rows = list(self.fetchAll())
- if rows is None:
- return None
-
+ rows = []
+ row = self.fetchOne()
+ while row is not None:
+ rows.append(row)
+ if len(rows) % 100 == 0:
+ print(".", end='')
+ row = self.fetchOne()
+
nSamples = len(rows)//nWavelengths
if nSamples == 0:
return None
spectra = np.zeros(shape=(nWavelengths, nSamples))
- wineIdentifiers = [""]*nSamples
+ spectrumIdentifiers = [""]*nSamples
for i,row in enumerate(rows):
spectra[i%nWavelengths, i//nWavelengths] = float(row['intensity'])
- match = re.search(r"([A-Z]+)_?\d+.txt", row["path"])
- if match is not None:
- wineIdentifiers[i//nWavelengths] = match.group(1)
+ spectrumIdentifiers[i//nWavelengths] = row['spectrumId']
+
+ return spectra, spectrumIdentifiers
+
+ def subtractFluorescence(self, rawSpectra, polynomialDegree=5):
+
+ """
+ Remove fluorescence background from the data.
+ :return: The corrected spectra, without the background (an array shaped like rawSpectra).
+ """
+
+ correctedSpectra = np.empty_like(rawSpectra)
+ for i in range(rawSpectra.shape[1]):
+ spectrum = rawSpectra[:, i]
+ correctedSpectra[:, i] = BaselineRemoval(spectrum).IModPoly(polynomialDegree)
- return spectra, wineIdentifiers
+ return correctedSpectra
def showProgressBar(self, iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
"""
diff --git a/testDatabase.py b/testDatabase.py
index 2d2bdca..9678632 100644
--- a/testDatabase.py
+++ b/testDatabase.py
@@ -4,67 +4,167 @@
import os
from ramandb import RamanDB
import requests
+import re
-class TestBuildDatabase(unittest.TestCase):
- def testDatabase(self):
- db = RamanDB()
- self.assertIsNotNone(db)
- self.assertTrue(os.path.exists(db.databasePath))
+class TestRamanDatabase(unittest.TestCase):
+ def setUp(self):
+ self.db = RamanDB()
+ # self.db = RamanDB("mysql://127.0.0.1/root@raman")
+ self.assertIsNotNone(self.db)
- def testWavelengths(self):
- db = RamanDB()
- self.assertIsNotNone(db.getWavelengths())
- self.assertEqual(len(db.getWavelengths()), 1044)
-
- def testWavelengthsProperty(self):
- db = RamanDB()
- self.assertIsNotNone(db.wavelengths)
- self.assertEqual(len(db.wavelengths), 1044)
-
- def testFileCount(self):
- db = RamanDB()
- self.assertIsNotNone(db.getCountFiles())
- self.assertEqual(db.getCountFiles(), 709)
-
- def testFilePaths(self):
- db = RamanDB()
- self.assertIsNotNone(db.getSpectraPaths())
- self.assertEqual(db.getCountFiles(), len(db.getSpectraPaths()))
-
- def testGetIntensity(self):
- db = RamanDB()
- matrix, labels = db.getIntensities()
+ @unittest.skip("Now in setUp")
+ def test01Database(self):
+ self.db = RamanDB()
+ self.assertIsNotNone(self.db)
+
+ def test02Wavelengths(self):
+ self.assertIsNotNone(self.db.getWavelengths())
+
+ def test03WavelengthsAreUniqueAndCommon(self):
+ """
+ Check that all RAW spectra have the same number of wavelengths.
+ This is a complex SQL statement with a sub-select, but it returns 1 if true and 0 if false.
+ """
+ self.db.execute("""
+ SELECT
+ MAX(spectralPts) = MIN(spectralPts) as wavelengthsAreAllTheSame
+ FROM
+ (SELECT
+ COUNT(wavelength) AS spectralPts
+ FROM
+ spectra
+ where dataType='raw'
+ GROUP BY wavelength) AS something;
+ """)
+ firstRecord = self.db.fetchOne()
+ self.assertEqual(firstRecord["wavelengthsAreAllTheSame"], 1)
+
+ def test04WavelengthsProperty(self):
+ self.assertIsNotNone(self.db.wavelengths)
+
+ def test05FileCount(self):
+ self.assertIsNotNone(self.db.getFileCount())
+
+ def test06FileCountShouldMatchRawSpectraTimesWavelength(self):
+ """
+ Number of points in the spectra database for 'raw' spectra should be #wavelengths x #files
+ """
+ rawSpectraCount = self.db.getFileCount()
+ wavelengthsCount = len(self.db.getWavelengths())
+
+ self.db.execute("select count(*) as count from spectra where dataType='raw'")
+ valueRecord = self.db.fetchOne()
+ self.assertEqual(valueRecord["count"], rawSpectraCount*wavelengthsCount)
+
+ def test07FilePaths(self):
+ self.assertIsNotNone(self.db.getSpectraPaths())
+ self.assertEqual(self.db.getFileCount(), len(self.db.getSpectraPaths()))
+
+ def test08GetWhiteSpectra(self):
+ self.db.execute("select count(*) as count from files inner join wines on wines.wineId = files.wineId where wines.color = 'white'")
+ firstRecord = self.db.fetchOne()
+ whiteWineFileCount = firstRecord["count"]
+
+ matrix, labels = self.db.getSpectraWithId(dataType='raw', color='white')
+ self.assertIsNotNone(matrix)
+
+ self.assertEqual(matrix.shape, (len(self.db.wavelengths), whiteWineFileCount))
+
+ def test09GetRedSpectra(self):
+ self.db.execute("select count(*) as count from files inner join wines on wines.wineId = files.wineId where wines.color = 'red'")
+ firstRecord = self.db.fetchOne()
+ redWineFileCount = firstRecord["count"]
+
+ matrix, labels = self.db.getSpectraWithId(dataType='raw', color='red')
self.assertIsNotNone(matrix)
- self.assertEqual(matrix.shape, (len(db.wavelengths), db.getCountFiles()))
-
- @unittest.skip("Ok, tested")
- def testDownload(self):
- url = 'https://www.dropbox.com/s/2st0sv7jpii6dz8/raman.db?dl=1'
- r = requests.get(url, allow_redirects=True)
- with open('test.db', 'wb') as file:
- file.write(r.content)
-
- @unittest.skip("Ok, tested")
- def testDownload(self):
- db = RamanDB()
- filename = db.downloadDatabase()
- self.assertTrue(os.path.exists(filename))
- os.remove(filename)
-
- @unittest.skip("Done, no need to redo.")
- def testAddFileIdToDatabase(self):
- db = RamanDB(writePermission=True)
- db.execute("select * from files order by path")
- records = db.fetchAll()
- for i, record in enumerate(records):
- db.execute("update files set fid={0} where md5='{1}'".format(i, record["md5"]))
-
- db.execute("select spectra.md5, files.fid from spectra inner join files on files.md5 = spectra.md5")
- records = db.fetchAll()
- for i, record in enumerate(records):
- statement = "update spectra set fid={0} where md5='{1}'".format(record["fid"], record["md5"])
- db.execute(statement)
+ self.assertEqual(matrix.shape, (len(self.db.wavelengths), redWineFileCount))
+
+ def test10ReadQEProFile(self):
+ wavelengths, intensities = self.db.readQEProFile('originaldata/Q100.txt')
+ self.assertEqual(len(intensities), 1044)
+
+ def test11InsertAllSpectra(self):
+ dataDir = 'originaldata'
+ filenames = os.listdir(dataDir)
+ filePaths = []
+ for filename in filenames:
+ filePaths.append(os.path.join(dataDir, filename))
+
+ inserted = self.db.insertSpectralDataFromFiles(filePaths)
+ if inserted == 0:
+ self.skipTest("Nothing was inserted")
+
+ def test12ExecuteCount(self):
+ self.assertTrue(self.db.executeCount("select count(*) as count from spectra") > 0)
+
+ def test13InsertAllCorrectedSpectra(self):
+ self.db.execute("select distinct spectrumId from spectra where spectrumId not in (select spectrumId from spectra where dataType='fluorescence-corrected')")
+ records = self.db.fetchAll()
+ if len(records) == 0:
+ self.skipTest("All corrected spectra exist in the database")
+
+ for record in records:
+ spectrumId = record["spectrumId"]
+ spectrum, labels = self.db.getSpectrum(dataType='raw', spectrumId=spectrumId)
+ if spectrum is None:
+ continue
+ degree = 100
+ correctedSpectrum = self.db.subtractFluorescence(spectrum, polynomialDegree=degree)
+ print(spectrumId)
+ match = re.search(r"(\d+)-(\d+)", spectrumId)
+ wineId = int(match.group(1))
+ sampleId = int(match.group(2))
+ self.db.insertSpectralData(self.db.wavelengths, correctedSpectrum[:,:], 'fluorescence-corrected', wineId, sampleId, 'BaselineRemoval-nomask-degree{0}'.format(degree))
+
+ @unittest.skip("done")
+ def test14BuildWineIdAndSampleId(self):
+ self.db.execute('update files set sampleId=substr(path,18,2) where path like "%\_%" ESCAPE "\"')
+
+ def test15WinesSummary(self):
+ wineSummary = self.db.getWinesSummary()
+ totalNumberOfSpectra = sum([ wine["nSamples"] for wine in wineSummary])
+
+ self.db.execute("select count(*) as count from spectra where dataType='raw'")
+ valueRecord = self.db.fetchOne()
+ self.assertEqual(valueRecord["count"], totalNumberOfSpectra*len(self.db.getWavelengths()))
+
+ def test16SingleSpectrum(self):
+ self.db.execute("select wavelength, intensity from spectra where spectrumId = '0002-0001'")
+ records = self.db.fetchAll()
+ for record in records:
+ print(record)
+
+ def test17DataTypes(self):
+ self.assertTrue('raw' in self.db.getDataTypes())
+
+ def test18GetSpectraValidTypeFluorescence(self):
+ if 'fluorescence-corrected' in self.db.getDataTypes():
+ spectra, spectrumIds = self.db.getSpectraWithId(dataType='fluorescence-corrected')
+ self.assertIsNotNone(spectra)
+ else:
+ self.skipTest("No background-corrected spectra in database")
+
+ def test19GetSpectraInvalidType(self):
+ with self.assertRaises(ValueError):
+ spectra = self.db.getSpectraWithId(dataType='unknown')
+
+ @unittest.skip("Only on dccote's computer")
+ def test20DatabaseMySQLLocal(self):
+ db = RamanDB("mysql://127.0.0.1/root@raman")
+ self.assertIsNotNone(db)
+ self.assertIsNotNone(db.getWavelengths())
+
+ def test21Wavenumbers(self):
+ self.assertIsNotNone(self.db.wavenumbers)
+
+ def test22Mask(self):
+ print(sum(self.db.wavelengthMask))
+ maskRange = []
+ for i, mask in enumerate(self.db.wavelengthMask):
+ if mask:
+ maskRange.append(i)
+ print(self.db.wavelengths[maskRange])
if __name__ == "__main__":
unittest.main()
\ No newline at end of file
diff --git a/testVino.py b/testVino.py
new file mode 100644
index 0000000..6d54842
--- /dev/null
+++ b/testVino.py
@@ -0,0 +1,81 @@
+import unittest
+import numpy as np
+from dcclab import Database
+import os
+from ramandb import RamanDB
+import requests
+import matplotlib.pyplot as plt
+from vino import vinoPCA
+
+class TestVInoClass(unittest.TestCase):
+ @unittest.skip("NOt now")
+ def testInit(self):
+ iterable = [31, 30, 30, 30, 80, 31, 33, 31, 30, 30, 30, 30, 30, 30, 30, 30, 104, 30, 30] # sans vin blanc parceque ça shit le aspect ratio
+ total = sum(iterable)
+
+ # Data = np.genfromtxt('/Users/Shooshoo/PycharmProjects/PCA_DCCLab/DataVino_Sorted.csv', delimiter=',')
+ db = RamanDB()
+ data, labels = db.getIntensities()
+ wavelengths = db.getWavelengths()
+ data = np.cat(wavelengths, wavelengths, data[:,0:total])
+ self.assertEqual(data.shape[1], total)
+ my_Spectrums = vinoPCA(data, iterable)
+
+ self.assertIsNotNone(my_Spectrums)
+
+ def testRemoveFluo(self):
+ iterable = [31, 30, 30, 30, 80, 31, 33, 31, 30, 30, 30, 30, 30, 30, 30, 30, 104, 30, 30] # sans vin blanc parceque ça shit le aspect ratio
+ total = sum(iterable)
+
+ # I need to remove this function, I don't have access to the csv file.
+ # Data = np.genfromtxt('/Users/Shooshoo/PycharmProjects/PCA_DCCLab/DataVino_Sorted.csv', delimiter=',')
+ # After a bit of playing around: column 0 is not used, column 1 is the wavelengths, then it's
+ # the data
+ my_Spectrums = vinoPCA()
+
+ self.assertIsNotNone(my_Spectrums)
+
+ my_Spectrums.subtractFluorescence()
+
+
+ def testDoPCA(self):
+ my_Spectrums = vinoPCA()
+
+ self.assertIsNotNone(my_Spectrums)
+
+ my_Spectrums.doPCA(10)
+ my_Spectrums.showTransformedData3D()
+ my_Spectrums.showTransformedData2D()
+ my_Spectrums.showEigenvectors()
+
+ def testvinoPCANoArgument(self):
+ my_Spectrums = vinoPCA()
+ self.assertIsNotNone(my_Spectrums)
+
+ my_Spectrums.doPCA(3)
+ my_Spectrums.showTransformedData3D()
+ my_Spectrums.showTransformedData2D()
+ my_Spectrums.showEigenvectors()
+
+ # def testInitDB(self):
+ # self.assertIsNotNone(vinoPCA().db)
+
+ def testColormap(self):
+ vino = vinoPCA()
+ cm = vino.getColorMap()
+ self.assertIsNotNone(cm)
+ spectra, labels = vino.db.getSpectraWithId()
+ self.assertEqual(len(cm), len(labels))
+
+ # def testOneSpectrum(self):
+ # vino = vinoPCA()
+ # spectra, labels = vino.db.getIntensities()
+ # plt.plot(spectra[:,1])
+ # newSpectra = vino.removeFLuo(spectra)
+ # print(newSpectra)
+ # # plt.plot(newSpectra)
+ # # plt.show()
+
+
+if __name__ == "__main__":
+ unittest.main()
\ No newline at end of file
diff --git a/vino.py b/vino.py
index 2f40e0f..01db57c 100644
--- a/vino.py
+++ b/vino.py
@@ -4,19 +4,23 @@
from sklearn.decomposition import PCA
from scipy import interpolate
from BaselineRemoval import BaselineRemoval
-
+from ramandb import RamanDB
class vinoPCA:
- def __init__(self, Data, numberOfEachSamples):
+ def __init__(self):
+ self.db = RamanDB()
+ self.constraints = []
+ self.data, self.labels = self.db.getSpectraWithId(dataType='raw')
+ self.correctedData, correctedLabel = self.db.getSpectraWithId(dataType='fluorescence-corrected')
+ if self.labels != correctedLabel:
+ raise ValueError('Not all spectra are corrected')
- """
- :param Data: The data on wich PCA should be done.
- :param colormap: An iterable that contains how many of each samples there is in Data, in the good order.
- """
+ self.wavelengths = self.db.getWavelengths()
- self.Data = Data
- self.numberOfEachSamples = numberOfEachSamples
+ self.wavelengthMask = self.db.wavelengthMask
+ self.data = self.data[self.wavelengthMask, :]
+ self.wavelengths = self.wavelengths[self.wavelengthMask]
def getColorMap(self):
@@ -25,50 +29,32 @@ def getColorMap(self):
:return: Return a colormap to visualise different samples on the plot.
"""
- for i in range(0, len(self.numberOfEachSamples)):
- if i == 0:
- colormap = np.zeros(self.numberOfEachSamples[0])
- else:
- colormap = np.append(colormap, np.ones(self.numberOfEachSamples[i]) *5*i)
+ uniqueLabelsInOrder = sorted(set(self.labels))
+ possibleColorsInOrder = range(len(uniqueLabelsInOrder))
+ colors = {}
+ for identifier, color in zip(uniqueLabelsInOrder, possibleColorsInOrder):
+ colors[identifier] = color*5
+
+ colormap = []
+ for identifier in self.labels:
+ colormap.append(colors[identifier])
- return colormap
+ return np.array(colormap)
- def removeFLuo(self, Data):
+ def subtractFluorescence(self):
"""
- Remove fluorescence background from the data given.
- :param Data: The Data from witch you wish to remove fluo background.
- :return: A new set of Data without the background.
+ Remove fluorescence background from the data.
+ :return: The corrected data, without the background.
"""
- nm = Data[:, 1]
- cm = 1 / (632.8e-9) - 1 / (nm * 1e-9)
- size = np.ma.size(Data, 1)
- polynomial_degree = 100
- filtered_datas = np.zeros(shape=(800, size - 1))
+ polynomial_degree = 5
+ correctedSpectra = np.empty_like(self.data)
+ for i in range(self.data.shape[1]):
+ spectre = self.data[:, i]
+ correctedSpectra[:, i] = BaselineRemoval(spectre).IModPoly(polynomial_degree)
- # for column in range(2, size):
- # y = Data[:, column]
- # d = 25
- # f2 = interpolate.interp1d(cm[199:][::d], y[199:][::d], kind='quadratic')
- # y = y[200:1000] - f2(cm[200:1000])
- # y = (y - min(y)) / max(y - min(y))
- # filt_datas[:, column - 1] = y
- # filt_datas[:, 0] = cm[200:1000]
-
- for column in range(2, size):
- spectre = Data[200:1000, column]
- baseObj = BaselineRemoval(spectre)
- values = baseObj.IModPoly(polynomial_degree)
- # values = values - min(values) # Si tu normalises, tu perds les composants communs (Alcool particulèrement)
- # values = values/max(values) # tu perds aussi le degrés de présence (Plus ou moins bouchonné ?)
- # Si tu normalises pas, tu favorises les composants communs présents à
- # différents degrés (Plus ou moins d'alcool). Donc tester avec et sans?
- filtered_datas[:, column - 1] = values
-
- filtered_datas[:, 0] = Data[200:1000, 1]
-
- return filtered_datas
+ return correctedSpectra
def doPCA(self, n:int):
@@ -77,11 +63,9 @@ def doPCA(self, n:int):
:param n: number of componants to get from the PCA
:return: Returns nothing. Just creats an array of the transformed datas into the new vector space
"""
-
- new_Datas = self.removeFLuo(self.Data)
- new_Datas = np.transpose(new_Datas)
- self.X_PCA = PCA(n_components=n)
- self.X_reduced = self.X_PCA.fit_transform(new_Datas[1:, :])
+ self.pca = PCA(n_components=n)
+ correctedData = self.subtractFluorescence()
+ self.X_reduced = self.pca.fit_transform(correctedData.T)
def showTransformedData3D(self):
@@ -94,9 +78,9 @@ def showTransformedData3D(self):
fig = plt.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
ax.scatter(
- self.X_reduced[:700, 0],
- self.X_reduced[:700, 1],
- self.X_reduced[:700, 2],
+ self.X_reduced[:, 0],
+ self.X_reduced[:, 1],
+ self.X_reduced[:, 2],
c=self.getColorMap(),
cmap='nipy_spectral',
s=10)
@@ -118,7 +102,7 @@ def showTransformedData2D(self):
plt.clf()
plt.figure(2)
- plt.scatter(self.X_reduced[:700, 0], self.X_reduced[:700, 1], c=self.getColorMap(), cmap='nipy_spectral', s=10)
+ plt.scatter(self.X_reduced[:, 0], self.X_reduced[:, 1], c=self.getColorMap(), cmap='nipy_spectral', s=10)
plt.title('First two PCA directions')
plt.xlabel('1st eigenvector')
plt.ylabel('2nd eigenvector')
@@ -138,7 +122,7 @@ def getAllEigenvectors(self):
:return: an array of n eigenvector
"""
- return self.X_PCA.components_.transpose()
+ return self.pca.components_.transpose()
def showEigenvectors(self):
@@ -148,13 +132,13 @@ def showEigenvectors(self):
"""
plt.figure(3)
plt.title('1st eigenvector')
- plt.plot(self.X_PCA.components_.transpose()[:, 0])
+ plt.plot(self.pca.components_.transpose()[:, 0])
plt.figure(4)
plt.title('2nd eigenvector')
- plt.plot(self.X_PCA.components_.transpose()[:, 1])
+ plt.plot(self.pca.components_.transpose()[:, 1])
plt.figure(5)
plt.title('3rd eigenvector')
- plt.plot(self.X_PCA.components_.transpose()[:, 2])
+ plt.plot(self.pca.components_.transpose()[:, 2])
plt.show()
def getTransformedDatas(self):
@@ -173,7 +157,7 @@ def getScreeValues(self):
:return: array of the scree values, from most important to least
"""
- return self.X_PCA.explained_variance_ratio_
+ return self.pca.explained_variance_ratio_
def plotScreeValues(self):
diff --git a/wines.txt b/wines.txt
new file mode 100644
index 0000000..3b19938
--- /dev/null
+++ b/wines.txt
@@ -0,0 +1,27 @@
+A 2022/01/12 Wine Sirius Bordeaux 2018 https://www.saq.com/en/223537 VPN France Merlot, Cabernet Sauvignon 2.2 red 13
+B 2022/01/12 Wine Ménage à Trois 2019 https://www.saq.com/en/10709152 VPN United States Cabernet Sauvignon 4.3 red 13.5
+C 2022/01/22 Wine Woodbridge by Robert Mondavi https://www.saq.com/en/48611 VPN United States Cabernet Sauvignon 7.3 red 13.5
+D 2022/01/28 Wine Les Jamelles Pinot Noir Pays d'Oc https://www.saq.com/en/10802904 VPN France point noir 4 red 13
+E 2022/01/27 Wine Monasterio de las Vinas https://www.saq.com/en/854422 VPN Spain 70% Garnacha, 20% Tempranillo, 10% Carinena 2.1 red 13.5
+F 2022/02/05 Wine Revolution https://www.saq.com/en/12166892 EP United States Ruby cabernet 50 %, Carignan 32 %, Syrah 18 % 10 red 13.5
+G 2022/02/12 Wine Milhistoraise https://www.saq.com/en/13794111 EP Spain Grenache 1.7 red 14
+H 2022/02/13 Wine Wallaroo Trail Shiraz https://www.saq.com/en/12498459 EP Australia Shiraz 85 %, Cabernet sauvignon 10 %, Petit verdot 5 % 11 red 13.5
+I 2022/02/13 Wine Toro loco https://futailles.com/en/products/wine/red/toro-loco EP Spain Tempranillo 0 red 12.5
+J 2022/02/13 Wine Cantini https://vinstriani.com/produits/cantini-rouge.html EP Italy Sangiovese, Montepulciano, and Cabernet Sauvignon - red 12
+K 2022/02/13 Wine Nicolas laloux https://www.vinsenepicerie.com/en/nicolas-laloux-1/ EP Ontario.Canada Cabernet Sauvignon - red 12.5
+L 2022/02/13 Wine smoky bay SHIRAZ https://www.lcbo.com/webapp/wcs/stores/servlet/en/lcbo/red-wine-14001/smoky-bay-shiraz-17650#.YguvavXMIUo EP Australia Shiraz 10 red 13
+M 2022/02/13 Wine Dolce Venti https://futailles.com/en/products/wine/red/dolce-venti EP Italy Merlot - red 11.5
+N 2022/02/13 Wine Aroma mi Amore https://vinsarista.com/en/produit/wines/aroma-mi-amore/aroma-mi-amore-red-wine/ EP Italy Refosco - red 14.5
+O 2022/02/19 Wine Sonho Aragonez https://www.vivino.com/CA/en/sonho-aragonez/w/5905886 EP Portugal Aragonez red 12.5
+P 2022/02/27 Wine Double vie https://vinsarista.com/en/produit/wines/double-vie/red-wine/ EP Canada red 12
+Q 2022/02/28 Wine Danza https://www.iga.net/en/product/wineargentinian-red-bonarda/00000_000000082424300222 EP Argentina Douce noir red 13.7
+R 2022/02/23 Wine bu https://www.iga.net/en/product/winered-rosso-terre-sicilaine-bio-it/00000_000000005604913702 EP Italy Nero d'Avola 70% + Merlot 20% + Syrah 10% red 12.5
+S 2022/02/24 Wine Croix d'Or https://futailles.com/en/products/wine/red/croix-dor EP Moldavie pinot noir red 12.5
+T 2022/02/18 Wine AUFKELLEREIEN https://www.iga.net/fr/produit/vin-blancallemagne---fruite-et-doux-9--alcool---18-ans--/00000_000000005604980687 AR Allemagne white 9
+U 2022/02/15 Wine Macon Lugny les Cray https://www.saq.com/en/13319061 DC France Bourgogne white
+V 2022/02/16 Wine Brumont Cotes de Gascogne https://www.saq.com/en/548883 DC France Sauvignon, Gros Manseng white 12
+W 2022/02/18 Wine Piuze https://www.saq.com/en/14853741 DC France Chardonnay white 12
+X 2022/02/19 Wine Chateau de Maligny https://www.saq.com/en/560763 DC France Chablis Chardonnay white 12.5
+Y 2022/02/21 Wine L'impromptu https://www.saq.com/en/13343264 DC France Gamay red 14
+Z 2022/02/22 Wine Sancerres Aurore Dezat https://www.saq.com/en/13992897 DC France Sancerre Chardonnay 1.6 white 12.5
+AA 2022/02/26 Wine Lord de la Ragotiere https://www.saq.com/en/10690501 DC France Chardonnay white 12
\ No newline at end of file