-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathsdb_getphot.py
More file actions
executable file
·444 lines (380 loc) · 18 KB
/
sdb_getphot.py
File metadata and controls
executable file
·444 lines (380 loc) · 18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
#!/usr/bin/env python3
"""Extract photometry from command line
When run from command line the arguments can either specify a number of
individual sdbids, or a sample which resides in a table in
sed_db_samples.
"""
import argparse
import re
from collections import OrderedDict
from os import mkdir,rename,remove,utime
from os.path import isdir,isfile,getmtime,basename,dirname
import glob
import hashlib
import filelock
import numpy as np
from astropy.table import Table,vstack,unique,Column
import mysql.connector
import sqlite3
import config as cfg
import sdf.db
import sdf.filter
def sdb_write_rawphot(file, tphot, tspec):
    """Write raw photometry to a file.

    Takes filename to write to, and astropy Tables with photometry and
    information on spectra. Each contains metadata that is printed out
    as key=value. As the spectra are printed as instrument=file these
    are stored in reverse in the meta dictionary (i.e. instrument
    non-unique).

    Parameters
    ----------
    file : str
        Path of the file to (over)write.
    tphot : astropy.table.Table
        Photometry table; ``tphot.meta['keywords']['id']['value']`` is
        written in the leading comment line.
    tspec : astropy.table.Table
        Spectra table. Not written directly here; spectra file names
        are presumably carried in ``tphot.meta`` by the caller — TODO
        confirm.

    TODO: Table.write can do the meta, they need to be "comments" and
    "keywords" entries in the meta dict.
    """
    # Use a context manager so the handle is closed even if the
    # table write raises.
    with open(file, 'w') as fh:
        # Leading "\ " marks an IPAC-format comment line; write the
        # backslash with an explicit escape ('\ ' is an invalid escape
        # sequence and a SyntaxWarning on modern Python).
        print('\\ photometry/spectra for '
              + tphot.meta['keywords']['id']['value'] + '\n\\', file=fh)
        tphot.write(fh, format='ascii.ipac',
                    formats={'Phot': '%6.4g', 'Err': '%6.4g', 'Sys': '%6.4g'})
def filehash(file):
    """Return an md5 hex digest for a file, ignoring IPAC comments.

    Lines that are too short to carry content (two characters or
    fewer, i.e. at most a bare "\\" plus newline) and lines starting
    with "\\ " (IPAC comment syntax) are excluded, so changes to
    comments/metadata alone do not change the hash.

    Parameters
    ----------
    file : str
        Path of the file to hash.

    Returns
    -------
    str
        Hex digest of the md5 of the retained lines, concatenated.
    """
    hasher = hashlib.md5()
    with open(file, 'r') as f:
        for text in f:
            # Skip near-empty lines and IPAC comment lines. Use an
            # explicit escaped backslash ('\ ' is an invalid escape
            # sequence on modern Python). Stream each line into the
            # hasher rather than accumulating the whole file in memory;
            # md5 of a concatenation equals incremental updates.
            if len(text) <= 2 or text.startswith('\\ '):
                continue
            hasher.update(text.encode())
    return hasher.hexdigest()
def sdb_getphot_one(id, verb=False):
    """Extract photometry and other info from a database.

    Results are put in text files within sub-directories for the given
    id. Assuming some photometry exists, a 'public' directory is always
    created, and as many "private" ones as needed are created to ensure
    that private data are separated from public data and each other. The
    private directories also get an .htaccess file that requires a user
    to be in the group of the same name to view the contents.

    Parameters
    ----------
    id : str
        Identifier looked up in the xids table; must resolve to exactly
        one sdbid or the process exits.
    verb : bool, optional
        If True, print each generated photometry SQL statement before
        executing it.
    """
    # set up connection; both branches return a (connection, cursor) pair
    if cfg.db['type'] == 'sqlite':
        cnx, cursor = sdf.db.get_cnx(':memory:')
    elif cfg.db['type'] == 'mysql':
        cnx, cursor = sdf.db.get_cnx(cfg.db['db_sdb'])

    if cfg.db['type'] == 'mysql':
        # allow GROUP BY used by some xmatch queries
        cursor.execute("SET SESSION SQL_MODE = REPLACE(@@SQL_MODE,'ONLY_FULL_GROUP_BY','');")
    elif cfg.db['type'] == 'sqlite':
        # NOTE(review): db path interpolated into the statement; assumed
        # to come from trusted local config.
        cursor.execute(f'ATTACH DATABASE "{cfg.db["path"]}photometry.db" AS photometry')

    # set up temporary table with what we'll want in the output;
    # columns mirror the dtype list used for tphot below
    cursor.execute("CREATE TEMPORARY TABLE fluxes ( Band varchar(15) NOT NULL DEFAULT '', "
                   "Phot double DEFAULT NULL, Err double DEFAULT 0.0, Sys double DEFAULT 0.0, "
                   "Lim int(1) NOT NULL DEFAULT '0', Unit varchar(10) NOT NULL DEFAULT '', "
                   "bibcode varchar(30) NOT NULL DEFAULT '', "
                   "Note1 varchar(100) NOT NULL DEFAULT '', Note2 varchar(100) NOT NULL DEFAULT '', "
                   "SourceID varchar(100) DEFAULT NULL, private int(1) NOT NULL DEFAULT '0', "
                   "exclude int(1) NOT NULL DEFAULT '0');")

    # get sdbid and xids
    # NOTE(review): id is interpolated directly into the SQL (injection
    # risk if ever fed untrusted input); also cursor.rowcount for a
    # SELECT is driver-dependent (-1 for sqlite3) — confirm the sdf.db
    # cursors are buffered/mysql-like.
    cursor.execute(f"SELECT DISTINCT sdbid FROM xids WHERE xid='{id}';")
    if cursor.rowcount > 1:
        print("Found multiple sdbids for given ID in xids {}, exiting".format(id))
        exit()
    elif cursor.rowcount == 0:
        print("No sdbid for ID in xids {}, exiting".format(id))
        exit()
    sdbid = cursor.fetchall()[0][0]

    # make cross id table to match on
    cursor.execute('DROP TABLE IF EXISTS TheIDs;')
    cursor.execute(f"CREATE TEMPORARY TABLE TheIDs AS SELECT * from xids WHERE sdbid = '{sdbid}';")
    cursor.execute('CREATE INDEX xidind ON TheIDs (xid);')

    # now add photometry to that table, use extra cursor
    cnx1, cursor1 = sdf.db.get_cnx(cfg.db['db_sdb'])
    cursor1.execute('SELECT incl,`table`,xid,band,col,ecol,syserr,'
                    'lim,bibcode,unit,com1,com2,'
                    'exclude,exclude_join,extra,private FROM xmatch;')
    # each xmatch row describes one source catalogue: which table to
    # join, which columns map to flux/error/etc., and exclusion rules
    for (incl,table,xid,band,col,ecol,sys,lim,bib,unit,c1,c2,
         excl,excl_join,extra,priv) in cursor1:

        if incl != 1:
            continue

        # columns to select; the xmatch fields are SQL fragments, not
        # values, so they are spliced into the statement text
        stmt = f'Insert INTO fluxes SELECT DISTINCT {band}, {col}, {ecol}, {sys}, {lim}, ' \
               f"'{unit}', {bib}, {c1}, {c2}, TheIDs.xid, {priv}"

        # excludes: build IF(0 OR ... ,1,0) so any matching rule sets
        # the exclude column to 1
        stmt += ',IF(0 '

        # table excludes, if band is present include must be null
        # (i.e. we are not actually including instead)
        if excl_join != None:
            stmt += ' OR IF(IF(exclude_band IS NOT NULL,1,0) AND IF(include IS NULL,1,0),1,0)'

        # column based excludes (which can be overridden by joined table)
        if excl != None and excl_join is None:
            stmt += ' OR ('+excl+')'
        if excl != None and excl_join != None:
            stmt += ' OR ( ('+excl+') AND IF(include is NOT NULL,0,1))'

        stmt += ',1,0)'

        # main join
        stmt += ' FROM TheIDs LEFT JOIN '+table+' ON TheIDs.xid = '+xid

        # join exlude table if exists
        if excl_join != None:
            stmt += ' LEFT JOIN phot_exclude ON ('+excl_join+'=phot_exclude.join_id ' \
                    'AND '+band+'=phot_exclude.exclude_band AND '+bib+'=phot_exclude.exclude_ref)'

        # require flux column not null
        stmt += ' WHERE '+col+' IS NOT NULL'

        # any extra conditions for WHERE
        if extra != None:
            stmt += ' '+extra

        # db-specific edits: sqlite spells IF() as IIF(); mysql needs
        # BINARY LIKE for case-sensitive matching
        if cfg.db['type'] == 'sqlite':
            stmt = stmt.replace('IF(', 'IIF(')
        elif cfg.db['type'] == 'mysql':
            stmt = stmt.replace('LIKE', 'BINARY LIKE')

        if verb:
            print(stmt)
        cursor.execute(stmt)

    cursor1.close()
    cnx1.close()

    # now get the fluxes; dtype mirrors the fluxes table definition above
    cursor.execute("SELECT DISTINCT * FROM fluxes;")
    tphot = Table(names=[desc[0] for desc in cursor.description],
                  dtype=('S15','f','f','f','i1','S10','S25','S200','S200','S100','i1','i1'))
    for row in cursor:
        try:
            tphot.add_row(row)
        except:
            # bare except: any un-encodable row aborts the whole export
            print("ERROR: "+sdbid+" not exported, returning.")
            print("Probably wierd (+meaningless) characters in row\n",row)
            return()

    # fix any NULL values that went to nan
    tphot['Err'][np.invert(np.isfinite(tphot['Err']))] = 0.0
    tphot['Sys'][np.invert(np.isfinite(tphot['Sys']))] = 0.0

    # add wavelengths, sort, and remove
    tphot['wave'] = np.zeros(len(tphot))
    for i,filt in enumerate(tphot['Band']):
        if sdf.filter.iscolour(filt):
            f = sdf.filter.Colour.get(filt)
        else:
            f = sdf.filter.Filter.get(filt)
        tphot['wave'][i] = f.mean_wavelength
    tphot.sort('wave')
    tphot.remove_column('wave')

    # ready these to receive meta
    tphot.meta['keywords'] = {}
    tphot.meta['comments'] = []

    # get some addtional stellar data; prefer simbad parallax, fall
    # back to gaia dr2 (with its fixed bibcode)
    cursor.execute("SELECT main_id,raj2000,dej2000,sp_type,sp_bibcode,"
                   "COALESCE(simbad.plx_value,gaia_dr2.plx) AS plx_value,"
                   "COALESCE(simbad.plx_err,gaia_dr2.e_plx) AS plx_err,"
                   "COALESCE(plx_bibcode,CASE WHEN gaia_dr2.plx IS NULL THEN NULL ELSE '2018yCat.1345....0G' END) AS plx_bibcode "
                   "FROM sdb_pm LEFT JOIN simbad USING (sdbid) LEFT JOIN gaia_dr2 USING (sdbid) "
                   f"WHERE sdbid='{sdbid}';")
    vals = cursor.fetchall()
    keys = [desc[0] for desc in cursor.description]
    if len(vals) > 0:
        tphot.meta['keywords'] = OrderedDict( zip(keys,tuple(vals[0])) )
    tphot.meta['keywords']['id'] = sdbid

    # look in xids for an id if nothing from simbad table
    if tphot.meta['keywords']['main_id'] is None:
        cursor.execute(f"SELECT DISTINCT xid FROM xids WHERE sdbid='{sdbid}' "
                       "ORDER BY xid LIMIT 1")
        if cursor.rowcount == 1:
            tphot.meta['keywords']['main_id'] = cursor.fetchall()[0][0]

    # get aors for any spectra and add file names
    cursor.execute('SELECT instrument,aor_key,bibcode,private,IFNULL(exclude,0) as exclude FROM spectra '
                   'LEFT JOIN spectra_exclude USING (aor_key,instrument) '
                   f"WHERE sdbid = '{sdbid}' ORDER BY aor_key DESC;")
    tspec = Table(names=[desc[0] for desc in cursor.description],
                  dtype=('S20','i8','S19','i1','i1'))
    for row in cursor:
        tspec.add_row(row)

    cursor.close()
    cnx.close()

    # mark photometry duplicates to be excluded from fitting
    bs,js,ns = np.unique(tphot['Band'],
                         return_inverse=True,return_counts=True)
    for i,(band,n) in enumerate(zip(bs,ns)):

        if n == 1 or band not in cfg.phot['merge_dupes']:
            continue

        # indices where this band are in the phot table
        bib_i = np.where(js == i)[0]
        bibcodes = tphot['bibcode'][bib_i]
        srt = np.flipud(np.array(np.argsort(bibcodes)))
        # reversed argsort of these indices, this puts the highest dates
        # first, but alphabetical entries come first at this point
        srt = bib_i[srt]

        # now shift any non-[0-9] starting indices to the end
        # (bibcodes start with a 4-digit year when real)
        r = re.compile('^[0-9]')
        vmatch = np.vectorize(lambda x:bool(r.match(x.decode())))
        num = vmatch(tphot['bibcode'][srt])
        if np.sum(num) < len(srt) and np.sum(num) > 0:
            num = np.where(num)[0]
            srt = np.roll(srt,len(srt)-np.min(num))

        # set exclude for everything beyond the first entry, unless any
        # of these were alredy excluded, in which case everything beyond
        # the first included measurement is excluded
        one_ok = False
        for j in srt:
            if one_ok is False and tphot['exclude'][j] == 0:
                one_ok = True
            else:
                tphot['exclude'][j] = 1

    # add empty column for file names
    add = np.chararray(len(tspec),itemsize=200)
    add[:] = ''
    tspec.add_column(Column(add,name='file'))
    for aor in tspec['aor_key']:
        for inst in cfg.spectra.keys():
            # cfg.spectra maps instrument -> (subdir, glob-prefix,
            # glob-suffix) — assumed from usage here, confirm in config
            loc,g1,g2 = cfg.spectra[inst]
            specfile = glob.glob(cfg.file['spectra']+loc+g1+str(aor)+g2)
            if len(specfile) > 1:
                print("WARNING: More than one file for aor:",aor)
            elif len(specfile) == 1:
                file_path = loc+basename(specfile[0])
                tspec['file'][(tspec['aor_key']==aor)] = file_path

    # now add these as table metadata, adding a number to the instrument
    # name so that the keys are unique
    for i in range(len(tspec)):
        if tspec['file'][i] != '':
            if tspec['exclude'][i]:
                # excluded spectra go in comments rather than keywords
                tphot.meta['comments'].append(tspec['instrument'][i]+str(i)+'='+tspec['file'][i])
            else:
                tphot.meta['keywords'][tspec['instrument'][i]+str(i)] = tspec['file'][i]

    # rearrange to ensure keyword structure is correct for IPAC ascii
    # format, which is {'keywords':{'keyword': {'value': value} }
    for key in tphot.meta['keywords'].keys():
        tphot.meta['keywords'][key] = {'value':tphot.meta['keywords'][key]}

    # find any private data by making table from phot and spec,
    # bibcode and private are the common columns
    tpriv = vstack([tphot,tspec],join_type='inner')
    if len(tpriv[tpriv['private'] == 1]) == 0:
        npriv = 0
    else:
        npriv = len(unique(tpriv[tpriv['private'] == 1]))

    # if there wasn't any photometry or spectra, then there's nothing to do
    if len(tpriv) == 0:
        print("No photometry or spectra for {}, skipping".format(sdbid))
        return
    elif len(tpriv) == len(tspec):
        print("No photometry, only spectra for {}, skipping".format(sdbid))
        return

    # write file(s), organising things if there is private data
    sedroot = cfg.file['sedroot']+sdbid+'/'
    if isdir(sedroot) == False:
        mkdir(sedroot)
    filename = sdbid+'-rawphot.txt'

    # make list of new dirs needed, 'public' will contain only public
    # photometry, 'all' will contain everything and is only needed if
    # there are multiple private sets of photometry
    newdirs = np.array(['public'])
    if npriv > 0:
        newdirs = np.append(newdirs,unique(tpriv[tpriv['private'] == 1])['bibcode'])
        if len(newdirs) > 2:
            newdirs = np.append(newdirs,'all')

    # go through samples, updating only if different
    cur = glob.glob(sedroot+'*/')
    for dir in newdirs:

        seddir = sedroot+dir+'/'

        # make dir if needed, if exists get or invent hash on old file
        oldhash = ''
        if isdir(seddir) == False:
            mkdir(seddir)
            mtime = getmtime(seddir)
        else:
            if isfile(seddir+filename):
                oldhash = filehash(seddir+filename)
                mtime = getmtime(seddir+filename)

        # create the .htaccess file if necessary
        if not isfile(seddir+'/.htaccess') and dir != 'public':
            fd = open(seddir+'/.htaccess','w')
            fd.write('AuthName "Must login"\n')
            fd.write('AuthType Basic\n')
            fd.write('AuthUserFile '+cfg.www['root']+'.htpasswd\n')
            fd.write('AuthGroupFile '+cfg.www['root']+'.htgroup\n')
            fd.write('require group admin '+dir+'\n')
            fd.close()

        # only write if we can get a lock, sdf may be fitting the old file
        lock = filelock.FileLock(seddir+'/.sdf_lock-'+filename)
        try:
            with lock.acquire(timeout = 0):

                # figure which rows to keep, and write temporary file;
                # each private dir keeps public rows plus its own bibcode
                okphot = np.logical_or(tphot['private'] == 0,
                                       tphot['bibcode'].astype(str) == dir.astype(str) )
                okspec = np.logical_or(tspec['private'] == 0,
                                       tspec['bibcode'].astype(str) == dir.astype(str) )
                tmpfile = cfg.file['sedtmp']
                if dir != 'all':
                    sdb_write_rawphot(tmpfile,tphot[okphot],tspec[okspec])
                else:
                    sdb_write_rawphot(tmpfile,tphot,tspec)

                # see if update needed, if so move it, otherwise delete;
                # comparing comment-insensitive hashes avoids touching
                # files whose data didn't change
                if filehash(tmpfile) != oldhash:
                    if oldhash != '':
                        print(sdbid,dir,": Different hash, updating file")
                    else:
                        print(sdbid,dir,": Creating file")
                    rename(tmpfile,seddir+filename)
                    # hack to retain mod time of old file
                    # utime(seddir+filename,times=(mtime,mtime))
                else:
                    print(sdbid,dir,": Same hash, leaving old file")
                    remove(tmpfile)

                # remove this dir listing if it's there
                if seddir in cur:
                    cur.remove(seddir)

                # check for orphaned SED directories
                if len(cur) > 0:
                    print("\nWARNING: orphaned directories exist:",cur,"\n")

        except filelock.Timeout:
            print(" WARNING: can't acquire lock for {}".format(dir+'/'+filename))
            print(" skipping")
def sdb_getphot(idlist, verb=False):
    """Call sdb_getphot_one for each id in a list.

    Parameters
    ----------
    idlist : list
        List of identifiers to extract photometry for.
    verb : bool, optional
        Passed through to sdb_getphot_one.

    Raises
    ------
    TypeError
        If idlist is not a list.
    """
    if not isinstance(idlist, list):
        # attach the explanation to the exception itself rather than
        # printing it separately before a message-less raise
        raise TypeError("sdb_getphot expects a list, got "
                        + type(idlist).__name__)
    for id in idlist:
        sdb_getphot_one(id, verb=verb)
# run from the command line
if __name__ == "__main__":

    # inputs: exactly one of --idlist/--sample/--all is required
    parser1 = argparse.ArgumentParser(description='Export photometry from SED DB')
    parser = parser1.add_mutually_exclusive_group(required=True)
    parser.add_argument('--idlist','-i',nargs='+',action='append',
                        dest='idlist',metavar='sdbid',help='Get photometry for one sdbid')
    parser.add_argument('--sample','-s',type=str,metavar='table',help='Get photometry for sample')
    parser.add_argument('--all','-a',action='store_true',help='Get all photometry')
    parser1.add_argument('--ra','-r',type=float,metavar='X',help='Only extract for RA>Xh with -a')
    parser1.add_argument('--dbname',type=str,help='Database containing sample table',
                         default=cfg.db['db_samples'],metavar=cfg.db['db_samples'])
    parser1.add_argument('--verb','-v',action='store_true',help='Be more verbose')
    args = parser1.parse_args()

    if args.idlist is not None:
        print("Running sdb_getphot for list:",args.idlist[0])
        sdb_getphot(args.idlist[0], verb=args.verb)

    elif args.sample is not None:
        print("Running sdb_getphot for targets in sample:",args.sample)
        cnx = mysql.connector.connect(user=cfg.db['user'],password=cfg.db['passwd'],
                                      host=cfg.db['host'],database=args.dbname,
                                      auth_plugin='mysql_native_password')
        cursor = cnx.cursor(buffered=True)
        # NOTE(review): db/table names cannot be parameterized; they come
        # from the command line, so this trusts the operator
        cursor.execute("SELECT sdbid FROM "+args.dbname+"."+args.sample+" "
                       "WHERE sdbid IS NOT NULL;")
        for id in cursor:
            sdb_getphot_one(id[0], verb=args.verb)
        cursor.close()
        cnx.close()

    elif args.all:
        # store_true default is False, not None, so test truthiness
        # ('args.all != None' was always True)
        print("Running sdb_getphot for all targets")
        cnx = mysql.connector.connect(user=cfg.db['user'],password=cfg.db['passwd'],
                                      host=cfg.db['host'],database='sdb',
                                      auth_plugin='mysql_native_password')
        cursor = cnx.cursor(buffered=True)
        if args.ra is None:
            cursor.execute("SELECT sdbid FROM sdb_pm;")
        else:
            # --ra is in hours; raj2000 is in degrees (15 deg/hour)
            cursor.execute("SELECT sdbid FROM sdb_pm WHERE raj2000>{};".format(args.ra*15))
        for id in cursor:
            sdb_getphot_one(id[0], verb=args.verb)
        cursor.close()
        cnx.close()