diff --git a/lifesim/__init__.py b/lifesim/__init__.py index 0f813c4..c61fa63 100644 --- a/lifesim/__init__.py +++ b/lifesim/__init__.py @@ -13,4 +13,6 @@ from lifesim.optimize.optimizer import Optimizer from lifesim.optimize.ahgs import AhgsModule -from lifesim.gui.spectrum_gui import Gui \ No newline at end of file +from lifesim.gui.spectrum_gui import Gui + +from lifesim.analysis.yield_wrapper import ScienceYield diff --git a/lifesim/analysis/yield_wrapper.py b/lifesim/analysis/yield_wrapper.py new file mode 100644 index 0000000..0ee575a --- /dev/null +++ b/lifesim/analysis/yield_wrapper.py @@ -0,0 +1,1206 @@ +import time +import shutil +import os +import contextlib +from copy import deepcopy +import json +from typing import Union +import logging +import yaml + +import numpy as np +from joblib import Parallel, delayed, parallel_config +from joblib_progress import joblib_progress +import pandas as pd +from tqdm import tqdm +import matplotlib.pyplot as plt + +import lifesim + +class ScienceYield: + def __init__(self, + config_path, + catalog_path, + output_path, + n_cpu: int = 1, + cat_from_ppop: bool = True): + self.config_path = config_path + self.catalog_path = catalog_path + self.output_path = output_path + self.n_cpu = n_cpu + self.cat_from_ppop = cat_from_ppop + + # Create logger that writes to `yield_wrapper_log.txt` in append mode. + os.makedirs(self.output_path, exist_ok=True) + log_file_path = os.path.join(self.output_path, 'yield_wrapper.log') + + self.logger = logging.getLogger('lifesim.yield_wrapper') + self.logger.setLevel(logging.INFO) + + # Avoid duplicating handlers for the same file when multiple instances are created + file_path_abs = os.path.abspath(log_file_path) + existing_handler = None + for h in list(self.logger.handlers): + if isinstance(h, logging.FileHandler) and getattr(h, 'baseFilename', None) == file_path_abs: + existing_handler = h + break + + if existing_handler is None: + fh = logging.FileHandler(log_file_path, mode='a') + fh.setLevel(logging.INFO) + fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) + self.logger.addHandler(fh) + self.logger.propagate = False + + # Log instance creation and parameters + self.logger.info('%s', '=' * 80) + self.logger.info('ScienceYield instance created at: %s', time.ctime()) + self.logger.info('Parameters:') + self.logger.info(' config_path: %s', self.config_path) + self.logger.info(' catalog_path: %s', self.catalog_path) + self.logger.info(' output_path: %s', self.output_path) + self.logger.info(' n_cpu: %s', self.n_cpu) + + def _compute_snrs(self, + output_path, + output_filename, + run_maxsep, + diameter=None): + + print('START OF RUN: ', time.ctime()) + print('RUN NAME: ', output_filename) + print('----------------------------') + + t = time.time() + # create bus + bus = lifesim.Bus() + + # setting the options + bus.build_from_config(filename=self.config_path) + bus.data.options.set_manual(n_cpu=self.n_cpu) # speed up calculation + + if diameter is not None: + bus.data.options.set_manual(diameter=diameter) + + bus.data.options.set_manual( + output_path=output_path) + bus.data.options.set_manual(output_filename=output_filename) + + + # ---------- Loading the Catalog ---------- + if self.cat_from_ppop: + bus.data.catalog_from_ppop(input_path=self.catalog_path) + else: + bus.data.import_catalog(input_path=self.catalog_path) + + # ---------- Creating the Instrument ---------- + + # create modules and add to bus + instrument = lifesim.Instrument(name='inst') + bus.add_module(instrument) + + 
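+        # The modules below supply the transmission map and the astrophysical
+        # photon-noise terms (exozodi, local zodi and stellar leakage). Each one
+        # is registered on the bus and connected to the 'inst' socket so that
+        # instrument.get_snr() can fold it into the noise budget; the stellar
+        # leakage module additionally receives the transmission map through the
+        # ('star', 'transm') connection.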
transm = lifesim.TransmissionMap(name='transm') + bus.add_module(transm) + + exo = lifesim.PhotonNoiseExozodi(name='exo') + bus.add_module(exo) + local = lifesim.PhotonNoiseLocalzodi(name='local') + bus.add_module(local) + star = lifesim.PhotonNoiseStar(name='star') + bus.add_module(star) + + # connect all modules + bus.connect(('inst', 'transm')) + bus.connect(('inst', 'exo')) + bus.connect(('inst', 'local')) + bus.connect(('inst', 'star')) + + bus.connect(('star', 'transm')) + + if run_maxsep: + # only run maxsep SNR for planets that are part of an experiment + bus.data.catalog['is_interesting'] = False + for exp in bus.data.options.optimization['experiments'].keys(): + mask_exp = ((bus.data.catalog.radius_p + >= bus.data.options.optimization['experiments'][exp]['radius_p_min']) + & (bus.data.catalog.radius_p + <= bus.data.options.optimization['experiments'][exp]['radius_p_max']) + & (bus.data.catalog.temp_s + >= bus.data.options.optimization['experiments'][exp]['temp_s_min']) + & (bus.data.catalog.temp_s + <= bus.data.options.optimization['experiments'][exp]['temp_s_max'])) + + if bus.data.options.optimization['experiments'][exp]['in_HZ']: + mask_exp = (mask_exp + & (bus.data.catalog['habitable'])) + + bus.data.catalog['exp_' + exp] = mask_exp + + bus.data.catalog['is_interesting'] = np.logical_or(mask_exp, bus.data.catalog['is_interesting']) + + bus.data.catalog = bus.data.catalog[bus.data.catalog.is_interesting] + bus.data.catalog['angsep'] = bus.data.catalog['maxangsep'] + + instrument.get_snr() + bus.save() + + del bus + + print('Generation took ', (time.time() - t) / 60, ' minutes to complete.') + + def run_aperture_sweep_snr(self, + mirror_diameters, + run_name): + + final_output_path = os.path.join(self.output_path, run_name) + if not os.path.exists(final_output_path): + os.makedirs(final_output_path) + else: + raise ValueError('Directory already exists: ' + final_output_path) + + for ndim, diameter in enumerate(mirror_diameters): + print('') + print('') + print('STARTING RUN FOR MIRROR DIAMETER: ', diameter) + print('AT TIME: ', time.ctime()) + + print('Preparing directories... ', end='') + + output_directory = os.path.join(final_output_path, 'diam_' + str(np.round(diameter, 2)).replace('.', '_')) + if not os.path.exists(output_directory): + os.makedirs(output_directory) + + print('[Done]') + + print('Commencing base run... ') + self._compute_snrs(output_path=f'{output_directory}/', + output_filename='sweep_diam_' + str(np.round(diameter, 2)).replace('.', '_'), + run_maxsep=False, + diameter=float(diameter)) + print('[Done]') + + print('Commencing maxsep run... 
') + self._compute_snrs(output_path=f'{output_directory}/', + output_filename='sweep_diam_' + str(np.round(diameter, 2)).replace('.', '_') + '_maxsep', + run_maxsep=True, + diameter=float(diameter)) + print('[Done]') + + # Append single multi-line log summary for this method + t_end = time.time() + try: + diam_dirs = sorted([d for d in os.listdir(final_output_path) + if os.path.isdir(os.path.join(final_output_path, d)) and d.startswith('diam_')]) + diameters_created = [d.replace('diam_', '').replace('_', '.') for d in diam_dirs] + except Exception: + diam_dirs = [] + diameters_created = [] + msg = f"""run_aperture_sweep_snr summary: + run_name: {run_name} + requested_mirror_diameters: {mirror_diameters} + mirror_directories_found: {diam_dirs} + mirror_diameters_reported: {diameters_created} + final_output_path: {final_output_path} + start_time: {time.ctime(t_end - (t_end - t_end))} # placeholder, exact start not stored in this scope + end_time: {time.ctime(t_end)} + elapsed_seconds: {round(0.0, 2)} # elapsed not measured here to avoid changing existing code flow + """ + self.logger.info(msg) + + def run_optimizer_sweep(self, + run_name, + source_name, + characterization: bool = False, + opt_limit_factor: Union[None, float] = None, + reduce_catalog: bool = False): + source_path = os.path.join(self.output_path, source_name) + # get the names of all subdirectories in output_path (which contain subdirectories for different mirror diameters) + subdirs = [d for d in os.listdir(source_path) if os.path.isdir(os.path.join(source_path, d))] + + # check a directory with the name run_name already exists in output_path, otherwise create it + final_output_path = os.path.join(self.output_path, run_name) + if not os.path.exists(final_output_path): + os.makedirs(final_output_path) + else: + raise ValueError('Directory already exists: ' + final_output_path) + + run_configs = [] + + for subdir in subdirs: + # make a list of all files ending in .hdf5 in subdir, then keep only the part of the filename before '_catalog.hdf5' and only if the sting does not contain 'maxsep' + catalog_files = [f.split('_catalog.hdf5')[0] for f in os.listdir(os.path.join(source_path, subdir)) + if f.endswith('_catalog.hdf5') and 'maxsep' not in f] + + # create a subdir of the same name in final_output_path, no existence check required + output_directory = os.path.join(final_output_path, subdir) + os.makedirs(output_directory) + + for catalog_file in catalog_files: + if self.n_cpu == 1: + compute_yields_mp( + output_filename=catalog_file, + output_path=output_directory + '/', + catalog_path=os.path.join(source_path, subdir, catalog_file + '_catalog.hdf5'), + config_path=self.config_path, + characterization=characterization, + opt_limit_factor=opt_limit_factor, + reduce_catalog=reduce_catalog, + ) + else: + run_configs.append({'output_filename': catalog_file, + 'output_path': output_directory + '/', + 'catalog_path': os.path.join(source_path, subdir, catalog_file + '_catalog.hdf5'), + }) + + if self.n_cpu > 1: + with parallel_config( + backend="loky", inner_max_num_threads=1 + ), joblib_progress( + description="Running SNR calculation in parallel ...", + total=len(run_configs), + ): + Parallel(n_jobs=self.n_cpu)( + delayed(compute_yields_mp)( + output_filename=rc['output_filename'], + output_path=rc['output_path'], + catalog_path=rc['catalog_path'], + config_path=self.config_path, + characterization=characterization, + opt_limit_factor=opt_limit_factor, + reduce_catalog=reduce_catalog, + ) + for rc in run_configs) + + # Append single 
multi-line log summary for this method + t_end = time.time() + # derive counts from filesystem to avoid modifying existing logic + catalog_counts = 0 + catalog_list = {} + for d in subdirs: + files = [f for f in os.listdir(os.path.join(source_path, d)) if f.endswith('_catalog.hdf5') and 'maxsep' not in f] + catalog_counts += len(files) + catalog_list[d] = files + msg = f"""run_optimizer_sweep summary: + run_name: {run_name} + source_name: {source_name} + source_path: {source_path} + n_cpu: {self.n_cpu} + subdirs_found: {subdirs} + total_subdirs_count: {len(subdirs)} + total_catalog_files_count: {catalog_counts} + catalog_files_by_subdir: {catalog_list} + run_configs_queued_parallel: {len(run_configs)} + final_output_path: {final_output_path} + characterization: {characterization} + opt_limit_factor: {opt_limit_factor} + end_time: {time.ctime(t_end)} + """ + self.logger.info(msg) + + def combine_catalog_maxsep(self, + source_name,): + source_path = os.path.join(self.output_path, source_name) + + print('START OF combine_catalog_maxsep: ', time.ctime()) + print('SOURCE NAME: ', source_name) + print('SOURCE PATH: ', source_path) + t_all = time.time() + + subdirs = [d for d in os.listdir(source_path) if os.path.isdir(os.path.join(source_path, d))] + + for subdir in subdirs: + print('--------------------------------------------------') + print('Processing subdir: ', subdir) + t_sub = time.time() + + # make a list of all files ending in .hdf5 in subdir, then keep only the part of the filename before '_catalog.hdf5' and only if the sting does not contain 'maxsep' + base_catalog_files = [f.split('_catalog.hdf5')[0] for f in os.listdir(os.path.join(source_path, subdir)) + if f.endswith('_catalog.hdf5') and 'maxsep' not in f] + + if len(base_catalog_files) == 0: + print(' No base catalog files found in', os.path.join(source_path, subdir)) + print(' Skipping subdir.') + print('Subdir processing took', round(time.time() - t_sub, 2), 's') + continue + + for catalog_file in base_catalog_files: + print(' ----------------------------------------------') + print(' Processing catalog:', catalog_file) + t_cat = time.time() + + base_path = os.path.join(source_path, subdir, catalog_file + '_catalog.hdf5') + maxsep_path = os.path.join(source_path, subdir, catalog_file + '_maxsep_catalog.hdf5') + + print(' Reading base catalog from:', base_path) + base_catalog = pd.read_hdf(base_path) + print(' Base catalog entries:', len(base_catalog)) + + print(' Reading maxsep catalog from:', maxsep_path) + maxsep_catalog = pd.read_hdf(maxsep_path) + print(' Maxsep catalog entries:', len(maxsep_catalog)) + + # map and fill missing values + base_catalog['maxsep_snr_1h'] = base_catalog['id'].map(maxsep_catalog.set_index('id')['snr_1h']) + missing_before = int(base_catalog['maxsep_snr_1h'].isna().sum()) + base_catalog['maxsep_snr_1h'].fillna(0, inplace=True) + print(f' Mapped maxsep snr_1h, filled {missing_before} missing values with 0') + + print(' Saving combined catalog to:', base_path) + base_catalog.to_hdf(base_path, key='catalog', mode='w') + + print(' Done processing', catalog_file, '- took', round(time.time() - t_cat, 2), 's') + + print('Finished subdir:', subdir, '- took', round(time.time() - t_sub, 2), 's') + + print('ALL combine_catalog_maxsep finished. 
Total time:', round(time.time() - t_all, 2), 's') + + # Append single multi-line log summary for this method + t_end = time.time() + # derive file counts from filesystem (no change to processing logic) + subdirs_list = [d for d in os.listdir(source_path) if os.path.isdir(os.path.join(source_path, d))] + total_base_catalogs = sum(len([f for f in os.listdir(os.path.join(source_path, d)) + if f.endswith('_catalog.hdf5') and 'maxsep' not in f]) for d in subdirs_list) + total_maxsep_catalogs = sum(len([f for f in os.listdir(os.path.join(source_path, d)) + if f.endswith('_maxsep_catalog.hdf5')]) for d in subdirs_list) + msg = f"""combine_catalog_maxsep summary: + source_name: {source_name} + source_path: {source_path} + subdirs_found: {subdirs_list} + subdirs_total: {len(subdirs_list)} + total_base_catalog_files: {total_base_catalogs} + total_maxsep_catalog_files: {total_maxsep_catalogs} + start_time: {time.ctime(t_all)} + end_time: {time.ctime(t_end)} + elapsed_seconds: {round(t_end - t_all, 2)} + """ + self.logger.info(msg) + + def get_mission_time(self, + catalog_path, + config_path): + bus = lifesim.Bus() + + # loading the config and catalog, make sure that the catalog is already combined with maxsep SNRs + bus.build_from_config(filename=config_path) + bus.data.import_catalog(input_path=catalog_path) + + # collect the experiments + exps = [col[4:] for col in bus.data.catalog.columns if col.startswith('exp_')] + + # recalculate the interesting flag + bus.data.catalog['is_interesting'] = False + for exp in exps: + mask_exp = ((bus.data.catalog.radius_p + >= bus.data.options.optimization['experiments'][exp]['radius_p_min']) + & (bus.data.catalog.radius_p + <= bus.data.options.optimization['experiments'][exp]['radius_p_max']) + & (bus.data.catalog.temp_s + >= bus.data.options.optimization['experiments'][exp]['temp_s_min']) + & (bus.data.catalog.temp_s + <= bus.data.options.optimization['experiments'][exp]['temp_s_max'])) + + if bus.data.options.optimization['experiments'][exp]['in_HZ']: + mask_exp = (mask_exp + & (bus.data.catalog['habitable'])) + + bus.data.catalog['exp_' + exp] = mask_exp + + bus.data.catalog['is_interesting'] = np.logical_or(mask_exp, bus.data.catalog['is_interesting']) + + cat_det = bus.data.catalog[np.logical_and( + bus.data.catalog.is_interesting, + bus.data.catalog.detected + )].sort_values('t_detected') + + cat_det['follow_up'] = False + cat_det['t_orbit'] = 0. + cat_det['t_char'] = 0. + + bus.data.catalog['follow_up'] = False + + # set up the time sheet that records the mission time per universe and per experiment + columns = ['detection', 'orbit', 'characterization', 'total'] + for exp in exps: + columns.append('n_' + exp) + time_sheet = pd.DataFrame(index=np.unique(cat_det.nuniverse), + columns=columns) + + # 1. get total time for detection campaign from every universe + t_det = bus.data.catalog.t_detected.max() + time_sheet['detection'] = t_det + + # 2. identify the follow_up targets for each universe + cat_det.sort_values('maxsep_snr_1h', ascending=False, inplace=True) + for nu in np.unique(cat_det.nuniverse): + for exp in exps: + mask_det = np.logical_and.reduce((cat_det.nuniverse == nu, + cat_det['exp_' + exp], + cat_det.t_detected <= t_det)) + # only set the top N targets to follow up where N is the sample size for the experiment + mask_det_indices = cat_det[mask_det].index[:bus.data.options.optimization['experiments'][exp]['sample_size']] + cat_det.loc[mask_det_indices, 'follow_up'] = True + time_sheet.loc[nu, 'n_' + exp] = len(mask_det_indices) + + # 3. 
calculate follow-up time for orbit and characterization + cat_det.loc[cat_det.follow_up, 't_orbit'] = ( + (((bus.data.options.optimization['snr_target'] / cat_det[cat_det.follow_up].maxsep_snr_1h) ** 2) + * 60 * 60 + + bus.data.options.array['t_slew']) + * (bus.data.options.optimization['n_orbits'] - 1) + ) + + cat_det.loc[cat_det.follow_up, 't_char'] = ( + (((bus.data.options.optimization['snr_char'] / cat_det[cat_det.follow_up].maxsep_snr_1h) ** 2) + * 60 * 60 + + bus.data.options.array['t_slew']) + ) + + for nu in np.unique(cat_det.nuniverse): + mask_followup = np.logical_and.reduce( + (cat_det.nuniverse == nu, cat_det.follow_up, + )) + time_sheet.loc[nu, 'orbit'] = cat_det.loc[mask_followup, 't_orbit'].sum() + time_sheet.loc[nu, 'characterization'] = cat_det.loc[mask_followup, 't_char'].sum() + + # 4. get total time + time_sheet['total'] = time_sheet['detection'] + time_sheet['orbit'] + time_sheet['characterization'] + + # 5. copy to original catalog + cols = ['follow_up', 't_orbit', 't_char'] + mapping_df = cat_det.set_index('id')[cols] + + for col, fill_value, out_type in [ + ('follow_up', False, bool), + ('t_orbit', 0.0, float), + ('t_char', 0.0, float), + ]: + # map and infer object dtypes first + s = bus.data.catalog['id'].map(mapping_df[col]).infer_objects(copy=False) + # replace missing values without using .fillna + s_filled = s.where(s.notna(), other=fill_value) + # assign with desired type + bus.data.catalog[col] = s_filled.astype(out_type) + + bus.save() + + return time_sheet + + def sweep_mission_time(self, + source_name, + all_runs: bool = False): + if all_runs: + # an optimized run directory always starts with 'opt_' + # run_names = [d for d in os.listdir(self.output_path) if os.path.isdir(os.path.join(self.output_path, d))] + + run_names = [d for d in os.listdir(self.output_path) + if os.path.isdir(os.path.join(self.output_path, d)) + and d.startswith('opt_')] + + for run_name in run_names: + self.sweep_mission_time(source_name=run_name, + all_runs=False) + self.process_mission_time(source_name=run_name) + + else: + source_path = os.path.join(self.output_path, source_name) + subdirs = [d for d in os.listdir(source_path) if os.path.isdir(os.path.join(source_path, d))] + + for subdir in subdirs: + print('--------------------------------------------------') + print('Processing subdir: ', subdir) + t_sub = time.time() + + # make a list of all files ending in .hdf5 in subdir, then keep only the part of the filename before '_catalog.hdf5' and only if the sting does not contain 'maxsep' + run_names = [f.split('_catalog.hdf5')[0] for f in os.listdir(os.path.join(source_path, subdir)) + if f.endswith('_catalog.hdf5')] + + for run_name in run_names: + print(' ----------------------------------------------') + print(' Processing run: ', run_name) + t_run = time.time() + + catalog_path = os.path.join(source_path, subdir, run_name + '_catalog.hdf5') + config_path = os.path.join(source_path, subdir, run_name + '.yaml') + + time_sheet = self.get_mission_time(catalog_path=catalog_path, + config_path=config_path) + + # save time_sheet to csv files, one per experiment + output_csv_path = os.path.join(source_path, subdir, run_name + '_mission_time.csv') + time_sheet.to_csv(output_csv_path) + print(' Saved mission time to ', output_csv_path) + + print(' Done processing run:', run_name, '- took', round(time.time() - t_run, 2), 's') + + # Append single multi-line log summary for this method + t_end = time.time() + subdirs_list = [d for d in os.listdir(source_path) if 
os.path.isdir(os.path.join(source_path, d))] + total_runs = sum(len([f for f in os.listdir(os.path.join(source_path, d)) if f.endswith('_catalog.hdf5')]) for d in subdirs_list) + mission_csvs = sum(len([f for f in os.listdir(os.path.join(source_path, d)) if f.endswith('_mission_time.csv')]) for d in subdirs_list) + msg = f"""sweep_mission_time summary: + source_name: {source_name} + source_path: {source_path} + subdirs_found: {subdirs_list} + subdirs_total: {len(subdirs_list)} + total_runs_catalogs_found: {total_runs} + mission_time_csvs_found: {mission_csvs} + end_time: {time.ctime(t_end)} + """ + self.logger.info(msg) + + def process_mission_time(self, + source_name): + # Start logging for this processing run (style consistent with other functions) + print('START OF process_mission_time: ', time.ctime()) + source_path = os.path.join(self.output_path, source_name) + print('SOURCE NAME: ', source_name) + print('SOURCE PATH: ', source_path) + t_all = time.time() + + subdirs = [d for d in os.listdir(source_path) if os.path.isdir(os.path.join(source_path, d))] + diams_float = [float('.'.join(d.split('_')[1:])) for d in subdirs] + diams = ['_'.join(d.split('_')[1:]) for d in subdirs] + + # -- 1. CREATE TYPETABLE -- + timetable = {} + + for subdir, d in zip(subdirs, diams): + csv_file = [f for f in os.listdir(os.path.join(source_path, subdir)) + if f.endswith('.csv')] + if len(csv_file) != 1: + raise ValueError('More than one csv file found in subdir ' + subdir) + timetable[d] = pd.read_csv(os.path.join(source_path, subdir, csv_file[0])) + + exps = [c.split('n_')[1] for c in timetable[diams[0]].columns if c.startswith('n_')] + + typetable = pd.DataFrame(columns=['detection', 'orbit', 'char', 'total'], index=diams_float) + + for d, df in zip(diams, diams_float): + typetable.loc[df, 'detection'] = np.max(timetable[d]['detection']) * 1.25 + typetable.loc[df, 'orbit'] = np.mean(timetable[d]['orbit']) * 1.25 + typetable.loc[df, 'char'] = np.mean(timetable[d]['characterization']) * 1.25 + typetable.loc[df, 'total'] = typetable.loc[df, 'detection'] + typetable.loc[df, 'orbit'] + typetable.loc[ + df, 'char'] + + typetable.sort_index(inplace=True, ascending=True) + + # -- 2. PLOT TYPETABLE -- + # fill under stepped line + columns = ['detection', 'orbit', 'char'] + columns_label = ['Detection', 'Orbit', 'Characterization'] + x = np.asarray(typetable.index, dtype=float) + y0 = np.zeros_like(x, dtype=float) + + colors = ['#2066a8', '#3594cc', '#8cc5e3'] + + fig, ax = plt.subplots() + + for i in range(len(columns)): + y = typetable[columns[i]].to_numpy(dtype=float) / (365.25 * 24 * 60 * 60) + # ax.step(x, y+y0, label=columns[i], where='mid', color=colors[i]) + ax.fill_between(x, y0, y + y0, step='mid', alpha=1, color=colors[i], label=columns_label[i], edgecolor=None) + + y0 += typetable[columns[i]].to_numpy(dtype=float) / (365.25 * 24 * 60 * 60) + + ax.axhline(y=5, color='tab:blue', linestyle='--') + ax.set_xlabel('Mirror Diameter (m)') + ax.set_ylabel('Time (years)') + ax.legend() + type_fig_path = os.path.join(self.output_path, source_name, source_name + '_by_type_time.pdf') + fig.savefig(type_fig_path) + plt.close() + + # Save typetable CSV and report as a single cohesive step + typetable_csv_path = os.path.join(self.output_path, source_name, source_name + '_typetable.csv') + + # -- 3. 
SAVE TYPETABLE -- + typetable /= (365.25 * 24 * 60 * 60) + typetable.to_csv(typetable_csv_path) + print('--------------------------------------------------') + print('Step 1/2: Typetable generated and saved.') + print(' Figure: ', type_fig_path) + print(' CSV: ', typetable_csv_path) + + # -- 4. CREATE EXPTABLE -- + exptable = {} + + for exp in exps: + exptable[exp] = pd.DataFrame( + columns=['detection', 'orbit', 'orbit-1s', 'orbit+1s', 'char', 'char-1s', 'char+1s', 'total', + 'total-1s', 'total+1s'], index=diams_float) + exptable[exp].sort_index(inplace=True) + for subdir in subdirs: + df = float('.'.join(subdir.split('_')[1:])) + catalog_file = [f for f in os.listdir(os.path.join(source_path, subdir)) + if f.endswith('.hdf5') and 'maxsep' not in f] + if len(catalog_file) != 1: + raise ValueError('More than one catalog file found in subdir ' + subdir) + config_file = [f for f in os.listdir(os.path.join(source_path, subdir)) + if f.endswith('.yaml')] + + bus = lifesim.Bus() + + # loading the config and catalog, make sure that the catalog is already combined with maxsep SNRs + bus.build_from_config( + filename=os.path.join(source_path, subdir, config_file[0])) + bus.data.import_catalog( + input_path=os.path.join(source_path, subdir, catalog_file[0])) + cat = bus.data.catalog + + for exp in exps: + mask = np.logical_and.reduce((cat.detected, cat['exp_' + exp])) + exptable[exp].loc[df, 'detection'] = (np.sum(np.unique(cat[mask].int_time)) + len( + np.unique(cat[mask].int_time)) * bus.data.options.array['t_slew']) * 1.25 + + mask = np.logical_and.reduce((cat.detected, cat['exp_' + exp], cat.follow_up)) + temp_t_orbit = [] + temp_t_char = [] + temp_t_total = [] + for nu in np.unique(cat[mask].nuniverse): + temp_t_orbit.append(np.sum(cat[np.logical_and(mask, cat.nuniverse == nu)].t_orbit)) + temp_t_char.append(np.sum(cat[np.logical_and(mask, cat.nuniverse == nu)].t_char)) + temp_t_total.append(np.sum(cat[np.logical_and(mask, cat.nuniverse == nu)].t_orbit) + np.sum( + cat[np.logical_and(mask, cat.nuniverse == nu)].t_char) + exptable[exp].loc[ + df, 'detection'] * 0.8) + exptable[exp].loc[df, 'orbit'] = np.mean(temp_t_orbit) * 1.25 + exptable[exp].loc[df, 'orbit+1s'] = np.quantile(temp_t_orbit, 0.841) * 1.25 + exptable[exp].loc[df, 'orbit-1s'] = np.quantile(temp_t_orbit, 0.159) * 1.25 + exptable[exp].loc[df, 'char'] = np.mean(temp_t_char) * 1.25 + exptable[exp].loc[df, 'char+1s'] = np.quantile(temp_t_char, 0.841) * 1.25 + exptable[exp].loc[df, 'char-1s'] = np.quantile(temp_t_char, 0.159) * 1.25 + exptable[exp].loc[df, 'total'] = np.mean(temp_t_total) * 1.25 + exptable[exp].loc[df, 'total+1s'] = np.quantile(temp_t_total, 0.841) * 1.25 + exptable[exp].loc[df, 'total-1s'] = np.quantile(temp_t_total, 0.159) * 1.25 + + del bus + del cat + + # -- 5. 
PLOT EXPTABLE -- + colors = [['#2066a8', '#3594cc', '#8cc5e3'], + ['#a00000', '#c46666', '#d8a6a6'], + ['#1f6f6f', '#54a1a1', '#9fc8c8'], ] + + columns = ['detection', 'orbit', 'char'] + try: + x = np.asarray(exptable[exps[0]].index, dtype=float) + except: + x = np.asarray(exptable[exps].index, dtype=float) + y0 = np.zeros_like(x, dtype=float) + + fig, ax = plt.subplots() + + for j, exp in enumerate(exps): + for i in range(len(columns)): + y = exptable[exp][columns[i]].to_numpy(dtype=float) / (365.25 * 24 * 60 * 60) + # ax.step(x, y+y0, label=columns[i], where='mid', color=colors[i]) + if i == 0: + ax.fill_between(x, y0, y + y0, step='mid', alpha=1, color=colors[j][i], edgecolor=None, label=exp) + else: + ax.fill_between(x, y0, y + y0, step='mid', alpha=1, color=colors[j][i], edgecolor=None) + y0 += y + + ax.set_xlabel('Mirror Diameter (m)') + ax.set_ylabel('Time (years)') + ax.axhline(y=5, color='tab:blue', linestyle='--') + ax.legend() + + exptable_fig_path = os.path.join(self.output_path, source_name, source_name + '_by_exp_time.pdf') + fig.savefig(exptable_fig_path) + plt.close() + + # -- 6. SAVE EXPTABLE -- + for exp in exps: + exptable[exp] /= (365.25 * 24 * 60 * 60) + exptable[exp].to_csv(os.path.join(self.output_path, source_name, source_name + '_' + exp + '_exptable.csv')) + + print('--------------------------------------------------') + print('Step 2/2: Exptable plotted and CSVs saved.') + print(' Figure: ', exptable_fig_path) + print(' CSVs: ', os.path.join(self.output_path, source_name)) + + # Final log with elapsed time + print('ALL process_mission_time finished. Total time:', round(time.time() - t_all, 2), 's') + + def mange_optimizations(self, + scenario_csv: str, + source_name: str, + csv_has_header: bool = True): + + header_arg = 0 if csv_has_header else None + + scenario_df = pd.read_csv(scenario_csv, header=header_arg) + + def _create_short_name(row): + parts = [] + + # 1. Handle Booleans: Add tag only if True + if row['Experiment_1']: + parts.append("e1") + if row['Experiment_2']: + parts.append("e2") + + # 2. Handle Float: Add 'f' prefix and remove decimal (0.5 -> 05) + # converting to string and replacing '.' is a robust way to handle this + factor_str = str(row['opt_limit_factor']).replace('.', '') + parts.append(f"f{factor_str}") + + # 3. 
Handle Characterization + if row['characterization']: + parts.append("char") + + # Join all parts with underscores + return "_".join(parts) + + # Apply the function row by row + scenario_df['filename'] = scenario_df.apply(_create_short_name, axis=1) + + # load base config file + with open(self.config_path, 'r') as f: + base_config = yaml.safe_load(f) + original_config_path = deepcopy(self.config_path) + + for i in range(len(scenario_df)): + run_name = 'opt_' + scenario_df.iloc[i]['filename'] + + # check if a directory with the name run_name already exists in output_path, otherwise create it + final_output_path = os.path.join(self.output_path, run_name) + + row = scenario_df.iloc[i] + + # create the custom config file based on the existing config file + custom_config = deepcopy(base_config) + + # Loop through the items in the row + for key, value in row.items(): + # CASE A: It is an Experiment toggle + if key.startswith('Experiment'): + if not value: + # If False, remove it from the dictionary safely + # .pop(key, None) prevents a KeyError if the key is already gone + custom_config['optimization']['experiments'].pop(key, None) + # If True, we do nothing (keep it) + + # CASE B: It is a parameter update (e.g., opt_limit_factor) + # We check if this key exists in the 'optimization' block to overwrite it + elif key in custom_config['optimization']: + custom_config['optimization'][key] = float(value) + + # save the custom config file to a temporary location + temp_config_path = os.path.join(self.output_path, 'temp_config_' + run_name + '.yaml') + with open(temp_config_path, 'w') as f: + yaml.dump(custom_config, f) + self.config_path = temp_config_path + + print('Starting optimization for run:', run_name) + + self.run_optimizer_sweep( + run_name=run_name, + source_name=source_name, + characterization=bool(row['characterization']), + opt_limit_factor=float(row['opt_limit_factor']), + reduce_catalog=True + ) + + # save the custom config file to final_output_path + custom_config_path = os.path.join(final_output_path, 'config_' + run_name + '.yaml') + with open(custom_config_path, 'w') as f: + yaml.dump(custom_config, f) + + print('Finished optimization for run:', run_name) + print('----------------------------------------') + + #delete the temporary config file + os.remove(temp_config_path) + + self.config_path = original_config_path + + + + def run_covergence_test(self, + run_name, + catalog_path, + min_universes, + num_steps, + plot=False): + + # check a directory with the name run_name already exists in output_path, otherwise create it + output_path = os.path.join(self.output_path, run_name) + if not os.path.exists(output_path): + os.makedirs(output_path) + else: + raise ValueError('Directory already exists: ' + output_path) + + output_path += '/' + + # determine number universes + catalog = pd.read_hdf(catalog_path, key='catalog') + total_universes = np.unique(catalog.nuniverse).shape[0] + del catalog + + if min_universes > total_universes-1: + raise ValueError('min_universes must be less than the total number of universes in the catalog.') + + if num_steps is None: + num_steps = total_universes - min_universes + 1 + elif num_steps > total_universes - min_universes + 1: + raise ValueError('num_steps is too large for the given min_universes and total universes in the catalog.') + + uni_sel = np.linspace(min_universes, total_universes, num_steps, dtype=int) + + if self.n_cpu > 1: + with parallel_config( + backend="loky", inner_max_num_threads=1 + ), joblib_progress( + description="Running SNR 
calculation in parallel ...", + total=len(uni_sel), + ): + results = Parallel(n_jobs=self.n_cpu)( + delayed(compute_yields_mp)( + output_filename=run_name + '_nuni_' + str(us), + output_path=output_path, + catalog_path=catalog_path, + config_path=self.config_path, + uni_sel=us, + return_yields=True + ) + for us in uni_sel) + else: + results = [] + for us in tqdm(uni_sel): + res = compute_yields_mp( + output_filename=run_name + '_nuni_' + str(us), + output_path=output_path, + catalog_path=catalog_path, + config_path=self.config_path, + uni_sel=us, + return_yields=True + ) + + results.append(res) + + experiments = list(results[0][1].keys()) + yield_convergence = {exp: {'n_universes': [], + 'mean_yield': [], + 'p16_yield': [], + 'p84_yield': []} + for exp in experiments} + for res in results: + n_uni, yields = res + for exp in experiments: + yield_convergence[exp]['n_universes'].append(n_uni) + yield_convergence[exp]['mean_yield'].append(yields[exp][0]) + yield_convergence[exp]['p16_yield'].append(yields[exp][1]) + yield_convergence[exp]['p84_yield'].append(yields[exp][2]) + + # save yield_convergence to a json file + with open(os.path.join(output_path, run_name + '_yield_convergence.json'), 'w') as f: + json.dump(yield_convergence, f, indent=4) + + if plot: + fig, ax = plt.subplots(figsize=(8, 6)) + for exp in experiments: + ax.plot(yield_convergence[exp]['n_universes'], + yield_convergence[exp]['mean_yield'], + label=exp) + ax.fill_between(yield_convergence[exp]['n_universes'], + yield_convergence[exp]['p16_yield'], + yield_convergence[exp]['p84_yield'], + alpha=0.3) + ax.set_xlabel('Number of Universes') + ax.set_ylabel('Yield') + ax.set_title('Yield Convergence Test') + ax.legend() + plt.savefig(os.path.join(output_path, run_name + '_yield_convergence.pdf')) + plt.close() + + +def get_yields(bus, + return_yields=False): + + yields = {} + + for exp in bus.data.options.optimization['experiments'].keys(): + mask = np.logical_and.reduce((bus.data.catalog.temp_s + >= bus.data.options.optimization['experiments'][exp]['temp_s_min'], + bus.data.catalog.temp_s + <= bus.data.options.optimization['experiments'][exp]['temp_s_max'], + bus.data.catalog.radius_p + >= bus.data.options.optimization['experiments'][exp]['radius_p_min'], + bus.data.catalog.radius_p + <= bus.data.options.optimization['experiments'][exp]['radius_p_max'], + bus.data.catalog.detected + )) + if bus.data.options.optimization['experiments'][exp]['in_HZ']: + mask = np.logical_and(mask, + bus.data.catalog.habitable) + + _, tyield = np.unique(bus.data.catalog[mask].nuniverse, return_counts=True) + + if tyield.size != 0: + yields[exp] = [float(np.mean(tyield)), float(np.percentile(tyield, 15.9)), float(np.percentile(tyield, 84.1))] + else: + yields[exp] = [0.0, 0.0, 0.0] + + return yields + +def compute_yields_mp(output_filename, + output_path, + catalog_path, + config_path, + uni_sel=None, + return_yields=False, + characterization: bool = False, + opt_limit_factor: Union[None, float] = None, + reduce_catalog: bool = False): + + t = time.time() + # create bus + bus = lifesim.Bus() + + # setting the options + bus.build_from_config(filename=config_path) + bus.data.options.set_manual(n_cpu=1) # speed up calculation + + if opt_limit_factor is not None: + bus.data.options.optimization['opt_limit_factor'] = opt_limit_factor + + bus.data.options.set_manual(output_path=output_path) + bus.data.options.set_manual(output_filename=output_filename) + bus.data.options.optimization['characterization'] = characterization + # ---------- Loading the 
Catalog ---------- + bus.data.import_catalog(input_path=catalog_path) + + if uni_sel is not None: + selected_universes = np.random.choice(np.unique(bus.data.catalog.nuniverse), replace=False, size=uni_sel) + bus.data.catalog = bus.data.catalog[np.isin(bus.data.catalog.nuniverse, selected_universes)] + + # ---------- Creating the Instrument ---------- + + # create modules and add to bus + instrument = lifesim.Instrument(name='inst') + bus.add_module(instrument) + + transm = lifesim.TransmissionMap(name='transm') + bus.add_module(transm) + + # connect all modules + bus.connect(('inst', 'transm')) + + # optimizing the result + opt = lifesim.Optimizer(name='opt') + bus.add_module(opt) + ahgs = lifesim.AhgsModule(name='ahgs') + bus.add_module(ahgs) + + bus.connect(('transm', 'opt')) + bus.connect(('inst', 'opt')) + bus.connect(('opt', 'ahgs')) + + # with contextlib.redirect_stdout(None): + opt.ahgs() + + if reduce_catalog: + to_remove = ['planet_flux_use', 'ecc_p', 'noise_astro', 'fp', 'small_omega_p', 'semimajor_p', 'dec', 'theta_p', + 'sep_p', 'albedo_geom_vis', 'z', 'photon_rate_noise', 'mass_p', 'mass_s', 'p_orb', 'flux_p', + 'photon_rate_planet', 'albedo_bond', 'inc_p', 'lat', 'stype', 'name_s', 'lon', 'ra', + 'albedo_geom_mir', 'large_omega_p', 'radius_s'] + bus.data.catalog.drop(columns=to_remove, inplace=True, errors='ignore') + + bus.save() + + if return_yields: + yields = get_yields(bus=bus, + return_yields=return_yields) + return int(uni_sel), yields + +def merge_runs(mapping_csv: str, + merge_csv: str, + output_path: str, + csv_has_header: bool = True, + id_prefixes: tuple = ('1', '2')) -> None: + """Merge sets of HDF catalogs described by two CSV files. + + Differences vs previous implementation: + - mapping_csv maps short name -> input directory (each input directory contains subdirectories like diam_2, diam_2_5, ...) + - merge_csv rows give two short names and an output name. 
For each row: + - create output_path// and an info file there + - create output_path//ap_merged/ + - verify both input directories contain exactly the same subdirs + - for each subdir, read the single base catalog file (ends with '_catalog.hdf5' and not containing 'maxsep') from both inputs, + remap ids with id_prefixes, concatenate, and write to ap_merged// + - do not read/merge any maxsep files + """ + import datetime + + # sanity checks for csvs and output dir + if not os.path.exists(mapping_csv): + raise FileNotFoundError(f"Mapping CSV file not found: {mapping_csv}") + if not os.path.exists(merge_csv): + raise FileNotFoundError(f"Merge CSV file not found: {merge_csv}") + + os.makedirs(output_path, exist_ok=True) + + header_arg = 0 if csv_has_header else None + + # read mapping csv and build dict short_name -> path (expected to be directories) + mapping_df = pd.read_csv(mapping_csv, header=header_arg) + if mapping_df.shape[1] < 2: + raise ValueError("Mapping CSV must contain at least two columns: short_name, path") + short_names = mapping_df.iloc[:, 0].astype(str).tolist() + paths = mapping_df.iloc[:, 1].astype(str).tolist() + mapping = {} + for s, p in zip(short_names, paths): + if s in mapping: + print(f"Warning: duplicate short name '{s}' in mapping CSV; using first occurrence.") + continue + mapping[s] = p + + # read merge csv + merge_df = pd.read_csv(merge_csv, header=header_arg) + if merge_df.shape[1] < 3: + raise ValueError("Merge CSV must contain at least three columns: short1, short2, output_name") + + input1_short = merge_df.iloc[:, 0].astype(str).tolist() + input2_short = merge_df.iloc[:, 1].astype(str).tolist() + output_names = merge_df.iloc[:, 2].astype(str).tolist() + + # helper to read HDF with fallback key + def _read_hdf_try(path): + try: + return pd.read_hdf(path) + except (KeyError, ValueError): + try: + return pd.read_hdf(path, key='catalog') + except Exception as e: + raise RuntimeError(f"Failed to read HDF file {path}: {e}") + + for idx, (s1, s2, out_name) in enumerate(zip(input1_short, input2_short, output_names), start=1): + print(f"Processing row {idx}:\n input_short1={s1}\n input_short2={s2}\n output_name={out_name}") + + if s1 not in mapping: + raise KeyError(f"Short name '{s1}' not found in mapping CSV") + if s2 not in mapping: + raise KeyError(f"Short name '{s2}' not found in mapping CSV") + + dir1 = mapping[s1] + dir2 = mapping[s2] + + if not os.path.exists(dir1) or not os.path.isdir(dir1): + raise FileNotFoundError(f"Input directory 1 not found or not a directory: {dir1}") + if not os.path.exists(dir2) or not os.path.isdir(dir2): + raise FileNotFoundError(f"Input directory 2 not found or not a directory: {dir2}") + + # normalize output folder and create structure + out_folder = os.path.join(output_path, out_name) + os.makedirs(out_folder, exist_ok=True) + ap_merged_root = os.path.join(out_folder, 'ap_merged') + os.makedirs(ap_merged_root, exist_ok=True) + + info_lines = [] + now = datetime.datetime.utcnow().isoformat() + 'Z' + info_lines.append(f"Merged catalogs run on: {now}") + info_lines.append(f"Input short1: {s1}") + info_lines.append(f" path: {dir1}") + info_lines.append(f"Input short2: {s2}") + info_lines.append(f" path: {dir2}") + info_lines.append("") + + # list subdirectories (only directories) + subdirs1 = sorted([d for d in os.listdir(dir1) if os.path.isdir(os.path.join(dir1, d))]) + subdirs2 = sorted([d for d in os.listdir(dir2) if os.path.isdir(os.path.join(dir2, d))]) + + info_lines.append(f"Subdirectories in {s1}: {subdirs1}") + 
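+        # Consistency check below: both inputs must contain the same set of
+        # diam_* subdirectories. On a mismatch the info file is still written so
+        # the discrepancy can be inspected, and the merge row is aborted with a
+        # ValueError.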
info_lines.append(f"Subdirectories in {s2}: {subdirs2}") + + if set(subdirs1) != set(subdirs2): + info_lines.append("") + info_lines.append("ERROR: Input directories do not contain the same set of subdirectories.") + note_path = os.path.join(out_folder, 'merge_info.txt') + with open(note_path, 'w') as fh: + fh.write('\n'.join(info_lines)) + raise ValueError(f"Subdirectory mismatch between {dir1} and {dir2}. Merge aborted. See {note_path} for details.") + + common_subdirs = sorted(subdirs1) # they are equal sets + + # process each subdir + for sub in common_subdirs: + info_lines.append("") + info_lines.append(f"Processing subdir: {sub}") + in_sub1 = os.path.join(dir1, sub) + in_sub2 = os.path.join(dir2, sub) + + # find single base catalog file in each (ends with _catalog.hdf5 and not containing 'maxsep') + base_files1 = [f for f in os.listdir(in_sub1) if f.endswith('_catalog.hdf5') and 'maxsep' not in f] + base_files2 = [f for f in os.listdir(in_sub2) if f.endswith('_catalog.hdf5') and 'maxsep' not in f] + + if len(base_files1) != 1: + info_lines.append(f" ERROR: expected exactly one base catalog in {in_sub1}, found {len(base_files1)}") + continue + if len(base_files2) != 1: + info_lines.append(f" ERROR: expected exactly one base catalog in {in_sub2}, found {len(base_files2)}") + continue + + bf1 = base_files1[0] + bf2 = base_files2[0] + info_lines.append(f" catalog file in {s1}: {bf1}") + info_lines.append(f" catalog file in {s2}: {bf2}") + + # ensure filenames match (same run name). If not, still proceed but preserve filenames (user expects consistent naming) + # create output subdir under ap_merged + out_sub = os.path.join(ap_merged_root, sub) + os.makedirs(out_sub, exist_ok=True) + + path1 = os.path.join(in_sub1, bf1) + path2 = os.path.join(in_sub2, bf2) + + try: + cat1 = _read_hdf_try(path1) + cat2 = _read_hdf_try(path2) + except Exception as e: + info_lines.append(f" ERROR reading catalogs: {e}") + continue + + info_lines.append(f" rows before merge: {s1}: {len(cat1)}, {s2}: {len(cat2)}") + + if 'id' not in cat1.columns or 'id' not in cat2.columns: + info_lines.append(" ERROR: both catalogs must contain an 'id' column. Skipping this subdir.") + continue + + # remap ids and keep originals + try: + cat1 = cat1.copy() + cat2 = cat2.copy() + cat1['id_orig__'] = cat1['id'] + cat2['id_orig__'] = cat2['id'] + # coerce to int then prefix as string and convert back to int + cat1['id'] = cat1['id'].astype(int).astype(str).apply(lambda s: int(str(id_prefixes[0]) + s)) + cat2['id'] = cat2['id'].astype(int).astype(str).apply(lambda s: int(str(id_prefixes[1]) + s)) + except Exception as e: + info_lines.append(f" ERROR remapping ids: {e}") + continue + + if cat1['id'].duplicated().any(): + info_lines.append(" ERROR: duplicate ids found within remapped catalog1. Skipping this subdir.") + continue + if cat2['id'].duplicated().any(): + info_lines.append(" ERROR: duplicate ids found within remapped catalog2. 
Skipping this subdir.") + continue + + merged = pd.concat([cat1, cat2], ignore_index=True, sort=False) + + out_catalog_path = os.path.join(out_sub, bf1) # preserve filename of first input's base file + try: + merged.to_hdf(out_catalog_path, key='catalog', mode='w') + except Exception as e: + info_lines.append(f" ERROR writing merged catalog to {out_catalog_path}: {e}") + continue + + info_lines.append(f" wrote merged catalog to: {out_catalog_path}") + info_lines.append(f" rows after merge: {len(merged)}") + + # write info file + note_path = os.path.join(out_folder, 'merge_info.txt') + with open(note_path, 'w') as fh: + fh.write('\n'.join(info_lines)) + + print(f"Finished processing merge row {idx}. Info written to: {note_path}") diff --git a/lifesim/analysis/yield_wrapper_notes.md b/lifesim/analysis/yield_wrapper_notes.md new file mode 100644 index 0000000..9802b19 --- /dev/null +++ b/lifesim/analysis/yield_wrapper_notes.md @@ -0,0 +1,171 @@ +# `ScienceYield` Wrapper + +The purpose of the science yield wrapper is to collect all functions necessary to run a yield with LIFEsim. +While detection yields with single catalogs or instrument setups can be quickly run using +`instrument.snr()` and `opt.ahgs()`, this wrapper is aimed at +- yields for varying instrument setups (e.g. aperture size) +- yields including the characterization yield + +Central tools to achieve this are +- a table of all synthetic planets, to which LIFEsim adds the `snr_1h` column. We call this _snr table_. +- the optimizer, which based on the `snr_1h` column creates an optimized observing sequence + +## Current State +So far, the ScienceYield wrapper has grown organically. +Here, we provide an overview of the current state. + +--- + +## `ScienceYield` + +### Constructor + +#### `__init__(self, ...)` + +**Description:** +Sets up the working environment, mainly paths to file locations. + +**Arguments:** + +| Argument | Type | Description | +|-----------------|------|----------------------------------------------| +| `config_path` | str | Path to the config file in .yaml format | +| `catalog_path` | str | Path to the input catalog | +| `output_path` | str | Location where the snr catalogs are saved | +| `n_cpu` | int | Number of CPUs to be used in multiprocessing | +| `cat_from_ppop` | bool | True if input catalog is in .txt format | + +**Example:** +```python +import lifesim +ywrap = lifesim.ScienceYield(config_path='/path/CONFIG_251104.yaml', + catalog_path='/path/Bryson2021_hab2low.txt', + output_path='/path/hab2low_500/', + n_cpu=64) +``` + +#### `_compute_snrs(self, ...)` + +**Description:** +Calculates on individual SNR table for one setting of the instrument. +Saves a catalog with `snr_1h` and a config file called `output_filename_catalog.hdf5` and `output_filename.yaml` to `output_path`. +This function is mainly meant to be an internal helper function, it is usually not required to use it. + +**Arguments:** + +| Argument | Type | Description | +|-------------------|------|---------------------------------------------------------------------------------| +| `output_path` | str | Path to where this individual run is saved | +| `output_filename` | str | Filename of catalog and config table | +| `run_maxsep` | bool | If true, all planets will be placed in their maximum separation | +| `diameter` | int | Mirror diameter for this run. 
If set to `None`, the diameter from the config file is used |
+
+**Files created:**
+
+| Name / Location                             | Content and Purpose            |
+|---------------------------------------------|--------------------------------|
+| `output_path/output_filename_catalog.hdf5`  | snr table for the single run   |
+| `output_path/output_filename.yaml`          | config file for the single run |
+
+**Example:**
+(continued from above)
+```python
+ywrap._compute_snrs(output_path='path/',
+                    output_filename='single_test',
+                    run_maxsep=False,
+                    diameter=None)
+```
+
+#### `run_aperture_sweep_snr(self, ...)`
+
+**Description:**
+One of the key instrument parameters is the sensitivity.
+For LIFE, we usually fix the photon-conversion efficiency and vary the aperture instead.
+This function creates snr tables for an array of mirror diameters.
+
+**Arguments:**
+
+| Argument           | Type            | Description                               |
+|--------------------|-----------------|-------------------------------------------|
+| `mirror_diameters` | list/np.ndarray | Array of all mirror sizes to be evaluated |
+| `run_name`         | str             | Name of the run (for file creation)       |
+
+**Files created:**
+
+For every diameter in `mirror_diameters`, the following files are created (taking 2.5 m as an example):
+
+| Name / Location                                                          | Content and Purpose                                                                          |
+|--------------------------------------------------------------------------|----------------------------------------------------------------------------------------------|
+| `self.output_path/run_name/diam_2_5/`                                    | new directory for this mirror diameter                                                       |
+| `self.output_path/run_name/diam_2_5/sweep_diam_2_5.yaml`                 | _config file_ for the run with the actual planet positions for the detection phase           |
+| `self.output_path/run_name/diam_2_5/sweep_diam_2_5_catalog.hdf5`         | ↑ and its _snr catalog_                                                                      |
+| `self.output_path/run_name/diam_2_5/sweep_diam_2_5_maxsep.yaml`          | _config file_ for the run with the maximum planet separation for the characterization phase |
+| `self.output_path/run_name/diam_2_5/sweep_diam_2_5_maxsep_catalog.hdf5`  | ↑ and its _snr catalog_                                                                      |
+
+**Example:**
+(continued from above)
+```python
+ywrap.run_aperture_sweep_snr(mirror_diameters=np.arange(2.0, 5.1, 0.25),
+                             run_name='aperture_size_2_to_5')
+```
+
+#### `run_optimizer_sweep(self, ...)`
+
+**Description:**
+Runs the optimizer on all snr tables in a given source folder.
+
+**Arguments:**
+
+| Argument      | Type | Description                                                              |
+|---------------|------|--------------------------------------------------------------------------|
+| `run_name`    | str  | Name of the run (for file creation)                                      |
+| `source_name` | str  | Name of the folder in `self.output_path` that contains the _snr tables_  |
+
+**Files created:**
+
+| Name / Location                                                 | Content and Purpose                                              |
+|-----------------------------------------------------------------|------------------------------------------------------------------|
+| `self.output_path/run_name/`                                    | new parent directory (an error is raised if it already exists)   |
+| `self.output_path/run_name/subdir/`                             | new directory for every directory in the source folder           |
+| `self.output_path/run_name/subdir/sweep_diam_2_5.yaml`          | _config file_ for the optimizer run                              |
+| `self.output_path/run_name/subdir/sweep_diam_2_5_catalog.hdf5`  | ↑ and its _snr catalog_ with the optimized observing sequence    |
+
+**Example:**
+(continued from above)
+```python
+ywrap.run_optimizer_sweep(run_name='opt_aperture_size_2_to_5',
+                          source_name='aperture_size_2_to_5')
+```
+
+---
+
+## Helper Functions
+
+#### `compute_yields_mp()`
+
+**Description:**
+Runs the optimizer on a single snr table and saves the resulting optimized catalog and config file to `output_path`. It is the worker function used by `run_optimizer_sweep` and `run_covergence_test`, and can optionally return the per-experiment yields.
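+
+A minimal sketch of a direct call; all paths and names below are placeholders, and
+during the sweeps this function is invoked internally rather than by hand:
+```python
+from lifesim.analysis.yield_wrapper import compute_yields_mp
+
+# placeholder paths -- point these at an existing snr table and its config file
+n_uni, yields = compute_yields_mp(output_filename='opt_diam_2_5',
+                                  output_path='/path/opt_single_test/',
+                                  catalog_path='/path/diam_2_5/sweep_diam_2_5_catalog.hdf5',
+                                  config_path='/path/CONFIG_251104.yaml',
+                                  uni_sel=50,          # draw 50 random universes (must not exceed the catalog)
+                                  return_yields=True)  # return (mean, 16th, 84th percentile) yield per experiment
+```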
+ +**Arguments:** + +| Argument | Type | Description | +|-------------------|----------|-------------------------------------------------------------------------------| +| `output_filename` | str | Name of output files | +| `output_path` | str | Path to which catalog and config file will be saved | +| `catalog_path` | str | Path to the input catalog file | +| `config_path` | str | Path to the config file | +| `uni_sel` | int/None | Number of universes to select from all universes. If `None`, all are selected | +| `return_yields` | bool | Set true to return experiment yield of this run | + +**Files created:** + +| Name / Location | Content and Purpose | +|-------------------------------------------|----------------------------------| +| `output_path/output_filename_catalog.hdf5`| snr table for the single run | +| `output_path/output_filename.yaml` | config file for the single run | + + +# What happens in the characterization ahgs + + + diff --git a/lifesim/core/core.py b/lifesim/core/core.py index dd5a009..2cfe9f9 100644 --- a/lifesim/core/core.py +++ b/lifesim/core/core.py @@ -6,7 +6,7 @@ import os import yaml -from numpy import ndarray, array +from numpy import ndarray, array, float64, float32, int64, int32, int16 import git from lifesim.core.data import Data @@ -442,6 +442,8 @@ def add_module(self, module.data = self.data def write_config(self): + add_numpy_representers() + module_dict = {key: str(type(module)) for (key, module) in self.modules.items()} # fetch version number and git commit hash if available @@ -475,7 +477,7 @@ def write_config(self): def build_from_config(self, filename: str): with open(filename) as file: - config_dict = yaml.load(file, Loader=yaml.FullLoader) + config_dict = yaml.load(file, Loader=yaml.UnsafeLoader) self.data.options.array = convert_to_np(config_dict['array']) self.data.options.optimization = convert_to_np(config_dict['optimization']) @@ -520,3 +522,14 @@ def convert_to_np(inp: dict): else: out[k] = inp[k] return out + +def add_numpy_representers(): + def numpy_repr(dumper, data): + return dumper.represent_data(data.item()) + + # Add for various numpy types + for dtype in [float64, float32, int64, int32, int16]: + yaml.add_representer(dtype, numpy_repr) + + # Handle numpy arrays specifically + yaml.add_representer(ndarray, lambda dumper, data: dumper.represent_list(data.tolist())) diff --git a/lifesim/instrument/instrument.py b/lifesim/instrument/instrument.py old mode 100644 new mode 100755 index 4000c67..013cae7 --- a/lifesim/instrument/instrument.py +++ b/lifesim/instrument/instrument.py @@ -1,9 +1,14 @@ from warnings import warn +from copy import deepcopy +from typing import List, Optional, Sequence, Any import numpy as np +import pandas as pd from tqdm import tqdm from spectres import spectres from PyQt5.QtGui import QGuiApplication +from joblib import Parallel, delayed, parallel_config +from joblib_progress import joblib_progress from lifesim.core.modules import InstrumentModule from lifesim.util.habitable import single_habitable_zone @@ -167,10 +172,11 @@ def get_wl_bins_const_spec_res(self): # TODO does not take the inclination into account! def adjust_bl_to_hz(self, hz_center: float, - distance_s: float): + distance_s: float, + l_sun: float): """ Adjusts the baseline of the array to be optimal for observations in the habitable zone of - the target star for the selected optimal wavelength. + the target star for the given optimisation model. 
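+
+        Depending on ``options.models['baseline_optimisation']`` ('ref_wl',
+        'Kepler' or 'Uniform'), the baseline is tied either to the first
+        transmission peak of the optimal wavelength or to polynomial fits in
+        stellar luminosity and distance. If ``options.models['discrete_baselines']``
+        is set, the result is snapped to the nearest value in
+        ``options.array['bl_discrete']``. The stellar luminosity ``l_sun`` is only
+        used by the 'Kepler' and 'Uniform' fits.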
Parameters ---------- @@ -179,14 +185,43 @@ def adjust_bl_to_hz(self, distance_s : float Distance between the observed star and the LIFE array in [pc]. """ + opt_model = self.data.options.models["baseline_optimisation"] # convert the habitable zone to radians hz_center_rad = hz_center / distance_s / (3600 * 180) * np.pi # in rad - # put first transmission peak of optimal wl on center of HZ - # for the origin of the value 0.5.. see Ottiger+2021 - baseline = (0.589645 / hz_center_rad - * self.data.options.other['wl_optimal'] * 10 ** (-6)) + if opt_model == "ref_wl": + # put first transmission peak of optimal wl on center of HZ + # for the origin of the value 0.5.. see Ottiger+2021 + k = 0.589645 * self.data.options.other['wl_optimal'] * 10 ** (-6) + + elif opt_model == "Kepler": + # Baseline optimisation considering the semi-major axis distribution of the Kepler + # occurance rates (SAG13/Bryson Hab2Max) and orbit projection effects, + # approximated via a polynomial fit to an MC analysis + sqL = l_sun**0.5 + if sqL > 0.28: #FGKs + cs = np.array([-2.5655e-1,2.0727e-1,4.8682e-2,-4.0867e-4,-2.1587e-2,9.3433])*1e-6 + else: + cs = np.array([-7.4078,1.4269e1,1.1875e-1,-2.3090e-3,-6.9372e-3,9.7141])*1e-6 + k = cs[0]*sqL + cs[1]*sqL**2 + cs[2]*distance_s + cs[3]*distance_s**2 + cs[4]*sqL*distance_s + cs[5] + + elif opt_model == "Uniform": + # Baseline optimisation considering a uniform semi-major axis distribution and orbit + # projection effects, approximated via a polynomial fit to an MC analysis + sqL = l_sun**0.5 + if sqL > 0.28: #FGKs + cs = np.array([1.9934e-1,6.4546e-2,2.6083e-2,-8.1794e-5,-1.7035e-2,8.3007])*1e-6 + else: #Ms + cs = np.array([-2.9216,4.4454,6.5858e-2,-1.4577e-3,2.8476e-2,8.4299])*1e-6 + k = cs[0]*sqL + cs[1]*sqL**2 + cs[2]*distance_s + cs[3]*distance_s**2 + cs[4]*sqL*distance_s + cs[5] + + baseline = k / hz_center_rad + + # If discrete, set baseline to nearest value + if self.data.options.models["discrete_baselines"]: + bls = np.array(self.data.options.array['bl_discrete']) + baseline = bls[np.abs(bls-baseline).argmin()] self.apply_baseline(baseline=baseline) @@ -228,7 +263,9 @@ def apply_baseline(self, def get_snr(self, save_mode: bool = False): """ - Calculates the one-hour signal-to-noise ration for all planets in the catalog. + Calculates the one-hour signal-to-noise ration for all planets in the catalog. Switches between single and + multi processing depending on the n_cpu option. + Parameters ---------- safe_mode : bool @@ -239,12 +276,6 @@ def get_snr(self, # options are applied before the simulation run self.apply_options() - # currently, the choice of integration time here is arbitrary. 
Since the background limited - # case is assumed, the SNR scales with sqrt(integration time) and through this, the SNR - # for any integration time can be calculated by knowing the SNR of a specific integration - # time - integration_time = 60 * 60 - self.data.catalog['snr_1h'] = np.zeros_like(self.data.catalog.nstar, dtype=float) self.data.catalog['baseline'] = np.zeros_like(self.data.catalog.nstar, dtype=float) if save_mode: @@ -253,20 +284,46 @@ def get_snr(self, self.data.catalog['photon_rate_planet'] = None self.data.catalog['photon_rate_noise'] = None + if self.data.options.other['n_cpu'] == 1: + self.get_snr_single_processing(save_mode=save_mode) + elif self.data.options.other['n_cpu'] > 1: + self.get_snr_multi_processing(save_mode=save_mode) + else: + raise ValueError('n_cpu option must be >= 1') + + def get_snr_single_processing(self, + save_mode: bool = False, + verbose : bool = True): + """ + Calculates the one-hour signal-to-noise ration for all planets in the catalog. + Parameters + ---------- + safe_mode : bool + If save mode is enables, the individual photon counts of the planet and noise sources + are written to the catalog. + """ + + # currently, the choice of integration time here is arbitrary. Since the background limited + # case is assumed, the SNR scales with sqrt(integration time) and through this, the SNR + # for any integration time can be calculated by knowing the SNR of a specific integration + # time + integration_time = 60 * 60 + # create mask returning only unique stars _, temp = np.unique(self.data.catalog.nstar, return_index=True) star_mask = np.zeros_like(self.data.catalog.nstar, dtype=bool) star_mask[temp] = True # iterate over all stars to calculate noise specific to stars - for i, n in enumerate(tqdm(np.where(star_mask)[0])): + for i, n in enumerate(tqdm(np.where(star_mask)[0], disable=not verbose)): # if i == 10: # break nstar = self.data.catalog.nstar.iloc[n] # adjust baseline of array and give new baseline to transmission generator plugin self.adjust_bl_to_hz(hz_center=float(self.data.catalog.hz_center.iloc[n]), - distance_s=float(self.data.catalog.distance_s.iloc[n])) + distance_s=float(self.data.catalog.distance_s.iloc[n]), + l_sun=float(self.data.catalog.l_sun.iloc[n]),) # get transmission map _, _, self.data.inst['t_map'], _, _ = self.run_socket(s_name='transmission', @@ -304,10 +361,10 @@ def get_snr(self, self.data.catalog.nuniverse == nuniverse))[0][0] noise_bg_universe_temp = (noise_bg_universe * self.data.catalog.z.iloc[n_u] - / self.data.catalog.z.iloc[n]) + / self.data.catalog.z.iloc[n]) noise_bg = ((noise_bg_star + noise_bg_universe_temp) - * integration_time * self.data.inst['eff_tot'] * 2) + * integration_time * self.data.inst['eff_tot'] * 2) # go through all planets for the chosen star for _, n_p in enumerate(np.argwhere( @@ -344,29 +401,68 @@ def get_snr(self, # Add up the noise and caluclate the SNR noise = noise_bg + noise_planet - self.data.catalog.snr_1h.iat[n_p] = np.sqrt((flux_planet ** 2 / noise).sum()) + + # use index label to avoid chained assignment / view-copy problems + idx_label = self.data.catalog.index[n_p] + self.data.catalog.loc[idx_label, 'snr_1h'] = np.sqrt((flux_planet ** 2 / noise).sum()) # save baseline - self.data.catalog['baseline'].iat[n_p] = self.data.inst['bl'] + self.data.catalog.loc[idx_label, 'baseline'] = self.data.inst['bl'] if save_mode: - self.data.catalog.noise_astro.iat[n_p] = [noise_bg] - self.data.catalog.planet_flux_use.iat[n_p] = ( + self.data.catalog.loc[idx_label, 'noise_astro'] = [noise_bg] + 
self.data.catalog.loc[idx_label, 'planet_flux_use'] = ( [flux_planet_thermal * integration_time * self.data.inst['eff_tot'] * self.data.inst['telescope_area']]) - self.data.catalog['photon_rate_planet'].iat[n_p] = ( + self.data.catalog.loc[idx_label, 'photon_rate_planet'] = ( flux_planet / integration_time / self.data.inst['eff_tot'] ).sum() - self.data.catalog['photon_rate_noise'].iat[n_p] = ( + self.data.catalog.loc[idx_label, 'photon_rate_noise'] = ( noise / integration_time / self.data.inst['eff_tot'] ).sum() + # ...existing code... + + def get_snr_multi_processing(self, + save_mode: bool = False): + + # divide the catalog into roughly equal chunks for each cpu + n_star, occ_star = np.unique(self.data.catalog.nstar, return_counts=True) + star_groups = balanced_partition_greedy(occ=occ_star, items=n_star, n_groups=self.data.options.other['n_cpu']*10) + + sub_catalogs = [self.data.catalog[np.isin(self.data.catalog.nstar, sg)] for sg in star_groups] + group_sizes = [len(sc) for sc in sub_catalogs] + per_dev = (np.max(group_sizes) - np.min(group_sizes)) / np.mean(group_sizes) + print(f'Maximum deviation in group sizes: {per_dev*100:.1f}%') + + reference_bus = deepcopy(self) + del reference_bus.data.catalog + + with parallel_config( + backend="loky", inner_max_num_threads=1 + ), joblib_progress( + description="Running SNR calculation in parallel ...", + total=len(star_groups), + ): + results = Parallel(n_jobs=self.data.options.other['n_cpu'])( + delayed(mp_runner)( + bus=reference_bus, + catalog=sc, + save_mode=save_mode + ) + for sc in sub_catalogs) + + # combine results back into main catalog + self.data.catalog = pd.concat(results) + self.data.catalog.sort_values('id', inplace=True) + + # TODO: fix units in documentation def get_spectrum(self, temp_s: float, # in K @@ -428,12 +524,6 @@ def get_spectrum(self, Returns the noise contribution in [photons] """ - # TODO: remove by 2024 - warn('The get_spectrum function was implemented with a major bug between versions 0.2.16 ' - 'and 0.2.24 in which the noise level was twice as large as the correct value. If ' - 'you created results with the versions in question, please validate them with the ' - 'latest version of LIFEsim.') - # options are applied before the simulation run self.apply_options() @@ -514,7 +604,8 @@ def get_spectrum(self, else: # adjust baseline to HZ self.adjust_bl_to_hz(hz_center=hz_center, - distance_s=distance_s) + distance_s=distance_s, + l_sun=l_sun) # calculate the transmission map _, _, self.data.inst['t_map'], _, _ = self.run_socket(s_name='transmission', @@ -689,7 +780,8 @@ def get_signal(self, # adjust baseline to habitable zone self.adjust_bl_to_hz(hz_center=hz_center, - distance_s=distance_s) + distance_s=distance_s, + l_sun=l_sun) # use spectres to rescale the spectrum onto the correct wl bins flux_planet_spectrum_input = flux_planet_spectrum @@ -768,78 +860,63 @@ def get_signal(self, return signal, flux_planet +def mp_runner(bus, + catalog, + save_mode: bool = False): + bus = deepcopy(bus) + bus.data.catalog = catalog + bus.get_snr_single_processing(save_mode=save_mode, + verbose=False) + return bus.data.catalog + +def balanced_partition_greedy(occ: Sequence[int], + n_groups: int, + items: Optional[Sequence[Any]] = None) -> List[List[Any]]: + """ + Greedy multi-way partition using NumPy for faster min selection. + Assigns the largest items first to the current smallest-sum group. 
-def multiprocessing_runner(input_dict: dict): - # create mask returning only unique stars - universes = np.unique( - input_dict['catalog'].nuniverse[input_dict['catalog'].nstar == input_dict['nstar']], - return_index=False - ) - - # get transmission map - _, _, self.data.inst['t_map'], _, _ = self.run_socket(s_name='transmission', - method='transmission_map', - map_selection='tm3') - - for nuniverse in universes: - inst.z = input_dict['catalog'][np.logical_and(input_dict['catalog'].nstar == input_dict['nstar'], - input_dict['catalog'].nuniverse == nuniverse)].z.iloc[0] - - # redo calculation for exozodi - inst.create_exozodi() - inst.sensitivity_coefficients(exozodi_only=True) - inst.fundamental_noise(exozodi_only=True) - - # go through all planets for the chosen star - for _, n_p in enumerate(np.argwhere( - np.logical_and(input_dict['catalog'].nstar.to_numpy() == input_dict['nstar'], - input_dict['catalog'].nuniverse.to_numpy() == nuniverse))[:, 0]): - inst.temp_planet = input_dict['catalog']['temp_p'].iloc[n_p] - inst.radius_planet = input_dict['catalog']['radius_p'].iloc[n_p] - inst.separation_planet = (input_dict['catalog']['angsep'].iloc[n_p] - * input_dict['catalog']['distance_s'].iloc[n_p]) - - # ----- must be repeated for every planet ----- - inst.create_planet(force=True) - inst.planet_signal() - - if (inst.chopping == 'nchop'): - inst.sn_nchop() - else: - inst.sn_chop() - - # save baseline - input_dict['catalog']['baseline'].iat[n_p] = input_dict['baseline'] - - # save snr results - if (inst.chopping == 'nchop'): - input_dict['catalog'].t_rot.iat[n_p] = input_dict['integration_time'] - input_dict['catalog'].signal.iat[n_p] = inst.photon_rates.loc['signal', 'nchop'].sum() - input_dict['catalog'].photon_noise.iat[n_p] = ( - np.sqrt((inst.photon_rates.loc['pn', 'nchop'] ** 2).sum())) - input_dict['catalog'].systematic_noise.iat[n_p] = ( - np.sqrt((inst.photon_rates.loc['sn', 'nchop'] ** 2).sum())) - else: - input_dict['catalog'].t_rot.iat[n_p] = input_dict['integration_time'] - input_dict['catalog'].signal.iat[n_p] = inst.photon_rates.loc['signal', 'chop'].sum() - input_dict['catalog'].photon_noise.iat[n_p] = ( - np.sqrt((inst.photon_rates.loc['pn', 'chop'] ** 2).sum())) - input_dict['catalog'].systematic_noise.iat[n_p] = ( - np.sqrt((inst.photon_rates.loc['sn', 'chop'] ** 2).sum())) - - if input_dict['safe_mode']: - if (inst.chopping == 'nchop'): - input_dict['noise_catalog'].loc[input_dict['catalog']['id'].iloc[n_p]] = ( - inst.photon_rates.nchop) - else: - input_dict['noise_catalog'].loc[input_dict['catalog']['id'].iloc[n_p]] = ( - inst.photon_rates.chop) - - return_dict = {'catalog': input_dict['catalog']} - if input_dict['safe_mode']: - return_dict['noise_catalog'] = input_dict['noise_catalog'] - - return return_dict - - - + Parameters + ---------- + occ : Sequence[int] + Sequence containing the weight (e.g. number of sub-objects) for each item. + n_groups : int + Number of groups to partition into. Must be >= 1. + items : Optional[Sequence[Any]] + Sequence of items corresponding to `occ`. If ``None``, the function will use the + indices ``range(len(occ))`` as the items. + + Returns + ------- + List[List[Any]] + List of length ``n_groups`` where each element is a list of the assigned items + (or indices if ``items`` was ``None``). Group totals are balanced using a greedy + heuristic; some groups may remain empty if ``n_groups`` > ``len(occ)``. + + Raises + ------ + ValueError + If ``n_groups`` < 1 or if ``items`` is provided but its length does not match ``occ``. 
+ """ + if n_groups <= 0: + raise ValueError("n_groups must be >= 1") + n = len(occ) + if items is None: + items = list(range(n)) + if len(items) != n: + raise ValueError("items and occ must have same length") + + occ_arr = np.asarray(occ, dtype=np.int64) + # sort indices by descending weight (largest first) + idxs = np.argsort(-occ_arr) + + groups: List[List[Any]] = [[] for _ in range(n_groups)] + sums = np.zeros(n_groups, dtype=np.int64) + + # assign each item to the group with the smallest current sum + for i in idxs: + g = int(np.argmin(sums)) # fast C-level operation + groups[g].append(items[int(i)]) + sums[g] += int(occ_arr[int(i)]) + + return groups diff --git a/lifesim/optimize/ahgs.py b/lifesim/optimize/ahgs.py index 742e31c..4fd339f 100644 --- a/lifesim/optimize/ahgs.py +++ b/lifesim/optimize/ahgs.py @@ -17,15 +17,42 @@ def obs_array_star(self, nstar): self.data.catalog.is_interesting)) if not np.any(mask): - return np.array((np.inf, np.inf)) + + if self.data.options.optimization['characterization']: + return np.array(((np.inf, np.inf), (np.inf, np.inf))) + else: + return np.array((np.inf, np.inf)) else: obs = (60 * 60 * (self.data.options.optimization['snr_target'] ** 2 - self.data.catalog['snr_current'].loc[mask] ** 2) / self.data.catalog.snr_1h.loc[mask] ** 2) - obs -= self.data.catalog.t_slew.loc[mask] - obs = np.sort(obs) / np.arange(1, np.count_nonzero(mask) + 1, 1) - return obs + + # included slew time + # needs to be divided by the number of universes for proper optimization, since the characterization happens + # 'per universe' + if self.data.options.optimization['characterization']: + obs_char = (60 * 60 * + self.data.options.optimization['snr_char'] ** 2 + / self.data.catalog.maxsep_snr_1h.loc[mask] ** 2 + + self.data.options.array['t_slew']) / self.data.optm['num_universe'] + + # characterization time is needed for optimization, detection time is needed to understand how much + # mission time was used for one observation + met = np.stack((obs, obs_char), axis=0) + + met = met[:, np.argsort(met[0])] + met[1] = np.cumsum(met[1]) + met[0] -= self.data.catalog.t_slew.loc[mask] + met[1] += met[0] + met = met / np.arange(1, np.count_nonzero(mask) + 1, 1)[np.newaxis, :] + + return met + else: + obs -= self.data.catalog.t_slew.loc[mask] + obs = np.sort(obs) / np.arange(1, np.count_nonzero(mask) + 1, 1) + + return obs def observe_star(self, nstar, @@ -55,29 +82,41 @@ def observe_star(self, * np.sqrt(int_actual / (60 * 60)))**2) + # Changed: use label-based .loc to avoid chained-assignment warnings for _, i in enumerate(np.where(mask)[0]): - if (not self.data.catalog.detected.iloc[i]) and \ - (self.data.catalog.snr_current.iloc[i] + idx = self.data.catalog.index[i] + if (not self.data.catalog.loc[idx, 'detected']) and \ + (self.data.catalog.loc[idx, 'snr_current'] >= self.data.options.optimization['snr_target']): - self.data.catalog.detected.iat[i] = True - self.data.catalog.t_detected.iat[i] = deepcopy(self.tot_time) + self.data.catalog.loc[idx, 'detected'] = True + self.data.catalog.loc[idx, 't_detected'] = deepcopy(self.tot_time + int_time) exp_cols = [col for col in self.data.catalog.columns if col.startswith('exp_')] - true_experiments = [col[4:] for col in exp_cols if self.data.catalog.at[i, col]] + true_experiments = [col[4:] for col in exp_cols if self.data.catalog.loc[idx, col]] - for exp in true_experiments: + for exp in true_experiments[1:]: self.data.optm['exp_detected'][exp] += 1 + self.data.optm['exp_detected_uni'][exp][ + 1, + 
self.data.optm['exp_detected_uni'][exp][0, :] == self.data.catalog.loc[idx, 'nuniverse'] + ] += 1 else: raise ValueError('Delete mode not implemented for AHGS optimizer.') def distribute_time(self): stars, n = np.unique(ar=self.data.catalog.nstar, return_counts=True) - obs = np.zeros((stars.shape[0], np.max(n))) + np.inf - - # fill the observation time array - for i, nstar in enumerate(stars): - temp = self.obs_array_star(nstar=nstar) - obs[i, :temp.shape[0]] = temp + if self.data.options.optimization['characterization']: + obs = np.zeros((stars.shape[0], 2, np.max(n))) + np.inf + # fill the observation time array + for i, nstar in enumerate(stars): + temp = self.obs_array_star(nstar=nstar) + obs[i, :, :temp.shape[1]] = temp + else: + obs = np.zeros((stars.shape[0], np.max(n))) + np.inf + # fill the observation time array + for i, nstar in enumerate(stars): + temp = self.obs_array_star(nstar=nstar) + obs[i, :temp.shape[0]] = temp obs_time = (self.data.options.optimization['t_search'] * self.data.options.array['t_efficiency']) @@ -86,45 +125,91 @@ def distribute_time(self): print('Number of planets detected for each experiment:') - while self.tot_time < obs_time: - # find the best global slope and observe star - no_star, ind_t = np.unravel_index(np.argmin(obs), obs.shape) - if (self.tot_time + obs[no_star, ind_t] * (ind_t + 1) + 0.01) > obs_time: - rem_time = obs_time - self.tot_time - self.observe_star(nstar=stars[no_star], - int_time=rem_time) - self.tot_time += rem_time + run_bool = True + + while run_bool: + if self.data.options.optimization['characterization']: + # find the best global slope and observe star + no_star, ind_t = np.unravel_index(np.argmin(obs[:, 1, :]), obs[:, 1, :].shape) + if not np.isfinite(obs[no_star, 1, ind_t]): + print('Not sufficient targets remaining to continue characterization optimization.') + break + if (((self.tot_time + obs[no_star, 0, ind_t] * (ind_t + 1) + 0.01) > obs_time) + and (self.data.options.optimization['opt_limit'] == 'time')): + rem_time = obs_time - self.tot_time + self.observe_star(nstar=stars[no_star], + int_time=rem_time) + self.tot_time += rem_time + else: + if ((obs[no_star, 1, ind_t] - obs[no_star, 0, ind_t]) * (ind_t + 1) + 0.01) < 0: + raise ValueError('Negative time difference encountered.') + self.observe_star(nstar=stars[no_star], + int_time=obs[no_star, 0, ind_t] * (ind_t + 1) + 0.01) + temp = self.obs_array_star(nstar=stars[no_star]) + self.tot_time += obs[no_star, 0, ind_t] * (ind_t + 1) + 0.01 + obs[no_star, :, :] = np.inf + obs[no_star, :, :temp.shape[1]] = temp else: - self.observe_star(nstar=stars[no_star], - int_time=obs[no_star, ind_t] * (ind_t + 1) + 0.01) - temp = self.obs_array_star(nstar=stars[no_star]) - self.tot_time += obs[no_star, ind_t] * (ind_t + 1) + 0.01 - obs[no_star, :] = np.inf - obs[no_star, :temp.shape[0]] = temp + # find the best global slope and observe star + no_star, ind_t = np.unravel_index(np.argmin(obs), obs.shape) + if (((self.tot_time + obs[no_star, ind_t] * (ind_t + 1) + 0.01) > obs_time) + and (self.data.options.optimization['opt_limit'] == 'time')): + rem_time = obs_time - self.tot_time + self.observe_star(nstar=stars[no_star], + int_time=rem_time) + self.tot_time += rem_time + else: + self.observe_star(nstar=stars[no_star], + int_time=obs[no_star, ind_t] * (ind_t + 1) + 0.01) + temp = self.obs_array_star(nstar=stars[no_star]) + self.tot_time += obs[no_star, ind_t] * (ind_t + 1) + 0.01 + obs[no_star, :] = np.inf + obs[no_star, :temp.shape[0]] = temp out_string = '' for key in 
self.data.optm['exp_detected'].keys(): out_string += (key + ': ' + str(self.data.optm['exp_detected'][key] / self.data.optm['num_universe']) + ' ') - out_string += ('- (' + str(np.round(self.tot_time / 60 / 60 / 24 / 365.25, decimals=1)) + ' / ' - + str(np.round(obs_time / 60 / 60 / 24 / 365.25, decimals=1)) - + ') yrs observed') + + if self.data.options.optimization['opt_limit'] == 'experiments': + out_string += ('- ' + str(np.round(self.tot_time / 60 / 60 / 24 / 365.25, decimals=1)) + + ' yrs observed') + else: + out_string += ('- (' + str(np.round(self.tot_time / 60 / 60 / 24 / 365.25, decimals=1)) + ' / ' + + str(np.round(obs_time / 60 / 60 / 24 / 365.25, decimals=1)) + + ') yrs observed') print('\r' + out_string, end='') - if any( - self.data.optm['exp_detected'][exp] / self.data.optm['num_universe'] - > self.data.options.optimization['experiments'][exp]['sample_size'] - and not self.data.optm['hit_limit'][exp] - for exp in self.data.optm['exp_detected'] - ): + # if any( + # self.data.optm['exp_detected'][exp] / self.data.optm['num_universe'] + # > self.data.options.optimization['experiments'][exp]['sample_size'] + # and not self.data.optm['hit_limit'][exp] + # for exp in self.data.optm['exp_detected'] + # ): + if any([ + ((self.data.optm['exp_detected_uni'][exp][1, :] + > self.data.options.optimization['experiments'][exp]['sample_size']).sum() > + (self.data.options.optimization['opt_limit_factor'] + * self.data.optm['num_universe'])) and not self.data.optm['hit_limit'][exp] + for exp in self.data.optm['exp_detected_uni']]): + # over_limit_experiments = [ + # exp for exp in self.data.optm['exp_detected'] + # if + # self.data.optm['exp_detected'][exp] > self.data.options.optimization['experiments'][exp][ + # 'sample_size'] + # and not self.data.optm['hit_limit'][exp] + # ] over_limit_experiments = [ - exp for exp in self.data.optm['exp_detected'] + exp for exp in self.data.optm['exp_detected_uni'] if - self.data.optm['exp_detected'][exp] > self.data.options.optimization['experiments'][exp][ - 'sample_size'] + (self.data.optm['exp_detected_uni'][exp][1, :] + > self.data.options.optimization['experiments'][exp]['sample_size']).sum() > + (self.data.options.optimization['opt_limit_factor'] + * self.data.optm['num_universe']) and not self.data.optm['hit_limit'][exp] ] + for exp in over_limit_experiments: self.data.optm['hit_limit'][exp] = True self.data.catalog['is_interesting'] = False @@ -133,16 +218,31 @@ def distribute_time(self): self.data.catalog['exp_' + exp_interesting]) if self.data.catalog['is_interesting'].sum() == 0: - print('\nAll experiments have been completed, spending remaining mission time on all HZ planets.') + if self.data.options.optimization['opt_limit'] == 'time': + print('\nAll experiments have been completed, spending remaining mission time on all HZ planets.') self.data.catalog['is_interesting'] = self.data.catalog['habitable'] else: print('\nCompleted experiments: ' + ', '.join(over_limit_experiments) + ', RECOUNTING -------------------') - obs = np.zeros((stars.shape[0], np.max(n))) + np.inf - # fill the observation time array - for i, nstar in enumerate(stars): - temp = self.obs_array_star(nstar=nstar) - obs[i, :temp.shape[0]] = temp + if self.data.options.optimization['characterization']: + obs = np.zeros((stars.shape[0], 2, np.max(n))) + np.inf + # fill the observation time array + for i, nstar in enumerate(stars): + temp = self.obs_array_star(nstar=nstar) + obs[i, :, :temp.shape[1]] = temp + else: + obs = np.zeros((stars.shape[0], np.max(n))) + np.inf + # 
fill the observation time array + for i, nstar in enumerate(stars): + temp = self.obs_array_star(nstar=nstar) + obs[i, :temp.shape[0]] = temp + + if self.data.options.optimization['opt_limit'] == 'time': + run_bool = self.tot_time < obs_time + elif self.data.options.optimization['opt_limit'] == 'experiments': + run_bool = not all(self.data.optm['hit_limit'].values()) + else: + raise ValueError('Optimization limit not recognized.') \ No newline at end of file diff --git a/lifesim/optimize/ahgs_char.py b/lifesim/optimize/ahgs_char.py new file mode 100644 index 0000000..dcb51fc --- /dev/null +++ b/lifesim/optimize/ahgs_char.py @@ -0,0 +1,170 @@ +import numpy as np + +from lifesim.core.modules import SlopeModule + +class AhgsCharModule(SlopeModule): + def __init__(self, + name: str): + super().__init__(name=name) + self.t_series = [] + self.yield_series = {'A': [], + 'F': [], + 'G': [], + 'K': [], + 'M': []} + self.id_series = [] + + def obs_array_star(self, nstar): + mask = self.data.catalog.nstar == nstar + + # return infinity if the detection limit is reached for this stype + if not bool(np.isin(element=self.data.catalog.stype.loc[mask].iloc[0], + test_elements=np.array(list(self.data.options.optimization['limit'].keys()))[np.invert( + self.data.optm['hit_limit'])])): + return np.array((np.inf, np.inf)) + else: + mask = np.logical_and.reduce((mask, + self.data.catalog.habitable, + np.invert(self.data.catalog.detected))) + + obs = (60 * 60 * + (self.data.options.optimization['snr_target'] ** 2 + - self.data.catalog['snr_current'].loc[mask] ** 2) + / self.data.catalog.snr_1h.loc[mask] ** 2) + + obs_char = (60 * 60 * + self.data.options.optimization['snr_char'] ** 2 + / self.data.catalog.maxsep_snr_1h.loc[mask] ** 2) + + met = np.stack((obs, obs_char), axis=0) + met = met[:, np.argsort(met[0])] + met[1] = np.cumsum(met[1]) + met[0] -= self.data.catalog.t_slew.loc[mask] + met[1] += met[0] + met = met / np.arange(1, np.count_nonzero(mask) + 1, 1)[np.newaxis, :] + + return met + + def observe_star(self, + nstar, + int_time): + mask = self.data.catalog.nstar == nstar + + self.data.optm['tot_time'] += int_time + + slew_time = self.data.catalog.loc[mask, 't_slew'].iloc[0] + if not (slew_time == 0): + if (slew_time + int_time) < 0: + self.data.catalog.loc[mask, 't_slew'] += int_time + int_actual = 0 + else: + self.data.catalog.loc[mask, 't_slew'] = 0 + self.data.catalog.loc[mask, 'int_time'] += (slew_time + int_time) + int_actual = slew_time + int_time + else: + self.data.catalog.loc[mask, 'int_time'] += int_time + int_actual = int_time + + self.data.catalog.loc[mask, 'int_time_actual'] += int_actual + + self.data.catalog.loc[mask, 'snr_current'] = np.sqrt( + self.data.catalog.loc[mask, 'snr_current'] ** 2 + + (self.data.catalog.loc[mask, 'snr_1h'] + * np.sqrt(int_actual + / (60 * 60)))**2) + + # self.data.catalog.loc[mask, 'snr_char_current'] = np.sqrt( + # self.data.catalog.loc[mask, 'snr_char_current'] ** 2 + # + (self.data.catalog.loc[mask, 'maxsep_snr_1h'] + # * np.sqrt(int_char_time + # / (60 * 60))) ** 2) + + det_ids = [] + + for _, i in enumerate(np.where(mask)[0]): + if (not self.data.catalog.detected.iloc[i]) and \ + (self.data.catalog.snr_current.iloc[i] + >= self.data.options.optimization['snr_target']): + self.data.catalog.detected.iat[i] = True + det_ids.append(self.data.catalog.id.iat[i]) + if self.data.catalog.habitable.iloc[i]: + self.data.optm['sum_detected'][ + np.where(np.array(list(self.data.options.optimization['limit'].keys())) + == self.data.catalog.stype.iloc[i])] += 1 + 
self.id_series.append(det_ids) + + def distribute_time(self): + if not self.data.options.optimization['habitable']: + raise ValueError('Characterization optimization only implemented for habitable planets.') + stars, n = np.unique(ar=self.data.catalog.nstar, + return_counts=True) + obs = np.zeros((stars.shape[0], 2, np.max(n))) + np.inf + + # fill the observation time array + for i, nstar in enumerate(stars): + temp = self.obs_array_star(nstar=nstar) + obs[i, :, :temp.shape[1]] = temp + + obs_time = (self.data.options.optimization['t_search'] + * self.data.options.array['t_efficiency']) + + tot_time = 0 + if self.data.options.optimization['verbose']: + print('Number of planets detected by stellar type:') + + while tot_time < obs_time: + # find the best global slope and observe star + no_star, ind_t = np.unravel_index(np.argmin(obs[:, 1, :]), obs[:, 1, :].shape) + if (tot_time + obs[no_star, 0, ind_t] * (ind_t + 1) + 0.01) > obs_time: + rem_time = obs_time - tot_time + self.observe_star(nstar=stars[no_star], + int_time=rem_time) + tot_time += rem_time + + else: + if ((obs[no_star, 1, ind_t] - obs[no_star, 0, ind_t]) * (ind_t + 1) + 0.01) < 0: + raise ValueError('Negative time difference encountered.') + self.observe_star(nstar=stars[no_star], + int_time=obs[no_star, 0, ind_t] * (ind_t + 1) + 0.01) + temp = self.obs_array_star(nstar=stars[no_star]) + tot_time += obs[no_star, 0, ind_t] * (ind_t + 1) + 0.01 + obs[no_star, :, :] = np.inf + obs[no_star, :, :temp.shape[1]] = temp + + out_string = '' + for key in self.data.options.optimization['limit'].keys(): + temp_yield = (self.data.optm['sum_detected'] / self.data.optm['num_universe'])[ + np.where(np.array(list(self.data.options.optimization['limit'].keys())) + == key)][0] + out_string += (key + ': ' + + str(temp_yield) + + ' ') + self.yield_series[key].append(temp_yield) + + out_string += ('- (' + str(np.round(tot_time/60/60/24/365.25, 1)) + ' / ' + + str(np.round(obs_time/60/60/24/365.25, 1)) + ') yrs observed') + + self.t_series.append(tot_time) + + if self.data.options.optimization['verbose']: + print('\r' + out_string, end='') + + if np.any( + np.logical_and( + (self.data.optm['sum_detected'] / self.data.optm['num_universe']) + > np.array(list(self.data.options.optimization['limit'].values())), + np.invert(self.data.optm['hit_limit']))): + if self.data.options.optimization['verbose']: + print('\n') + print('HIT LIMIT, RECOUNTING -------------------') + self.data.optm['hit_limit'] = ((self.data.optm['sum_detected'] + / (self.data.optm['num_universe'])) + >= np.array( + list(self.data.options.optimization['limit'].values()) + )) + obs = np.zeros((stars.shape[0], 2, np.max(n))) + np.inf + + # fill the observation time array + for i, nstar in enumerate(stars): + temp = self.obs_array_star(nstar=nstar) + obs[i, :, :temp.shape[1]] = temp diff --git a/lifesim/optimize/optimizer.py b/lifesim/optimize/optimizer.py index 9a6cdfb..7bab8d6 100644 --- a/lifesim/optimize/optimizer.py +++ b/lifesim/optimize/optimizer.py @@ -81,12 +81,12 @@ def ahgs(self): rmax = np.inf for stype, limit in self.data.options.optimization['limit'].items(): self.data.options.optimization['experiments'][stype] = { - 'radius_p_min': rmin, - 'radius_p_max': rmax, - 'temp_s_min': self.data.catalog[self.data.catalog.stype==stype].temp_s.min(), - 'temp_s_max': self.data.catalog[self.data.catalog.stype==stype].temp_s.max(), + 'radius_p_min': float(rmin), + 'radius_p_max': float(rmax), + 'temp_s_min': float(self.data.catalog[self.data.catalog.stype==stype].temp_s.min()), + 
'temp_s_max': float(self.data.catalog[self.data.catalog.stype==stype].temp_s.max()), 'in_HZ': self.data.options.optimization['habitable'], - 'sample_size': self.data.options.optimization['limit'][stype], + 'sample_size': float(self.data.options.optimization['limit'][stype]), } if self.data.options.optimization['experiments'] is None: @@ -98,6 +98,7 @@ def ahgs(self): self.data.catalog['is_interesting'] = False self.data.optm['hit_limit'] = {} self.data.optm['exp_detected'] = {} + self.data.optm['exp_detected_uni'] = {} for exp in self.data.options.optimization['experiments'].keys(): mask_exp = ((self.data.catalog.radius_p >= self.data.options.optimization['experiments'][exp]['radius_p_min']) @@ -118,7 +119,7 @@ def ahgs(self): self.data.optm['hit_limit'][exp] = False self.data.optm['exp_detected'][exp] = 0 - + print(self.data.optm) self.data.optm['tot_time'] = 0 # in sec # add new columns to catalog diff --git a/lifesim/util/options.py b/lifesim/util/options.py index 715c6d9..4b0a855 100644 --- a/lifesim/util/options.py +++ b/lifesim/util/options.py @@ -24,6 +24,8 @@ class Options(object): - ``'baseline'`` : Length of the shorter nulling baseline in [m]. - ``'bl_min'`` : Minimum allowed length of the shorter nulling baseline in [m]. - ``'bl_max'`` : Maximum allowed length of the shorter nulling baseline in [m]. + - ``'bl_discrete'``: Array of discrete baselines for use with "discrete" baseline + mode in [m]. - ``'ratio'`` : Ratio between the nulling and the imaging baseline. E.g. if the imaging baseline is twice as long as the nulling baseline, the ratio will be 2. - ``'t_slew'`` : Slew time required for the array to shift from one target system to @@ -44,6 +46,12 @@ class Options(object): ``'darwinsim'`` - ``'habitable'`` : Model used for calculating the habitable zone, possible options are ``'MS'`` and ``'POST_MS'`` + - ``'fov_taper'`` : Model used for tapering the field of view, possible options are + ``'gaussian'`` and ``'none'`` + - ``'baseline_optimisation'`` : Model used for optimising the baseline, possible options are + ``'ref_wl'``, ``'Kepler'``, ``'Uniform'``. + - ``'discrete_baselines'``: Boolean as to whether to use discrete baselines in the simulation + given by 'bl_discrete'. optimization : dict Options concerning the methods used to optimally distribute the observing time. - ``'N_pf'`` : Number of sampling locations per orbit. 
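The two `options.py` hunks around this point introduce the knobs that drive the new baseline behaviour (`baseline_optimisation`, `discrete_baselines`, `bl_discrete`) and the parallel SNR path (`n_cpu`). As an illustration only, here is a minimal sketch of how they might be combined at run time; it assumes that `Options.set_manual` resolves these new keys the same way it resolves existing ones such as `diameter`, and that a predefined scenario (here called `'baseline'`) is available:

```python
import numpy as np
import lifesim

bus = lifesim.Bus()
bus.data.options.set_scenario(case='baseline')  # assumed scenario name

# switch from the default 'ref_wl' optimisation to the Kepler-based model
bus.data.options.set_manual(baseline_optimisation='Kepler')

# restrict the nulling baseline to a discrete grid of values in [m]
bus.data.options.set_manual(discrete_baselines=True,
                            bl_discrete=list(np.arange(10.0, 110.0, 10.0)))

# run the SNR calculation on several worker processes (new multi-processing path)
bus.data.options.set_manual(n_cpu=4)
```

With `discrete_baselines` enabled, `adjust_bl_to_hz` snaps the optimised baseline to the nearest entry of `bl_discrete` before it is applied.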
@@ -67,6 +75,7 @@ def __init__(self): 'baseline': 0., 'bl_min': 0., 'bl_max': 0., + 'bl_discrete': [0.], 'ratio': 0., 't_slew': 0., 't_efficiency': 0.} @@ -76,16 +85,24 @@ def __init__(self): 'n_plugins': 0, 'output_path': None, 'output_filename': None, - 'fov_threshold': 0.} + 'fov_threshold': 0., + 'n_cpu': 1} self.models = {'localzodi': '', 'habitable': '', - 'fov_taper': ''} + 'fov_taper': '', + 'baseline_optimisation': '', + 'discrete_baselines': False} self.optimization = {'N_pf': 0., 'snr_target': 0., 'experiments': None, - 't_search': 0.} + 't_search': 0., + 'characterization': False, + 'snr_char': 0., + 'opt_limit': 'time', + 'opt_limit_factor': 0.5, + 'n_orbits': 1} def set_scenario(self, case: str): @@ -117,11 +134,14 @@ def set_scenario(self, self.models['localzodi'] = 'darwinsim' self.models['habitable'] = 'MS' self.models['fov_taper'] = 'gaussian' + self.models['baseline_optimisation'] = 'ref_wl' + self.models['discrete_baselines'] = False self.optimization['N_pf'] = 25 self.optimization['snr_target'] = 7 - self.optimization['limit'] = {'A': np.inf, - 'F': np.inf, + self.optimization['snr_char'] = 44.1 + self.optimization['n_orbits'] = 5 + self.optimization['limit'] = {'F': np.inf, 'G': np.inf, 'K': np.inf, 'M': np.inf} diff --git a/setup.py b/setup.py index 6f0e885..3c2c276 100644 --- a/setup.py +++ b/setup.py @@ -36,14 +36,16 @@ def get_version(rel_path: str) -> str: 'matplotlib>=3.7.0', 'numpy>=1.24.2', 'pandas>=1.5.3', - 'PyQt5==5.15.4', + 'PyQt5>=5.15.4', 'tqdm>=4.64.1', - 'tables>=3.8.0', - 'GitPython>=3.1.32'], + 'GitPython>=3.1.32', + 'joblib>=1.5.2', + 'joblib-progress>=1.0.6', + 'tables>=3.10.2'], license='GPLv3', zip_safe=False, keywords='lifesim', - python_requires='~=3.8', + python_requires='~=3.12', classifiers=[ 'Development Status :: 3 - Alpha', 'Intended Audience :: Science/Research',
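Finally, the `balanced_partition_greedy` helper added to `instrument.py` earlier in this patch (it backs `get_snr_multi_processing`) can be exercised on its own. The sketch below uses made-up star IDs and per-star planet counts and assumes the helper remains importable from `lifesim.instrument.instrument` as written in the diff:

```python
import numpy as np
from lifesim.instrument.instrument import balanced_partition_greedy

# hypothetical star IDs and the number of planets around each star (the weights)
n_star = np.array([101, 102, 103, 104, 105, 106, 107, 108])
occ_star = np.array([12, 3, 7, 9, 1, 5, 4, 8])

# split the stars into 4 groups with roughly equal total planet counts
groups = balanced_partition_greedy(occ=occ_star, n_groups=4, items=n_star)

for group in groups:
    total = occ_star[np.isin(n_star, group)].sum()
    print(group, '-> total planets:', total)
```

Each worker then receives the sub-catalogue of all planets belonging to one star group, which keeps the per-process workload roughly even without splitting any star across processes.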