diff --git a/.gitignore b/.gitignore
index b909629..c0f95b3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -307,3 +307,6 @@ hs_err_pid*
 
 # Project specific
 /.idea/
+/data/metadata.ipac
+/data/metadata.ipac.tar.gz
+/data/metadata.feather
diff --git a/README.md b/README.md
index 286b09a..66c610e 100644
--- a/README.md
+++ b/README.md
@@ -34,17 +34,10 @@ python lightcurve_downloader.py
 ```
 ---
 ### To locally download ALL MOA9yr lightcurves from the NExSci archive [can be improved]:
-* This will take ~ 55 days... For now, I recommend breaking this function in 15 pieces and run parallel, e.g. piece #2:
-  ```
-  metadata = Metadata()
-  n = 2
-  df_total = metadata.dataframe
-  df_temp = df_total[df_total['ROW_NUM'] > (n-1)*160604]
-  df = df_temp[df_temp['ROW_NUM'] <= n*160604]
-  ```
 * In `all_lightcurves_downloader.py`:
   * You can change the path by changing the variable `path_to_save_ ='[the_path_you_want_]/'`
-  * You can also change the extension `lightcurve_extension_='.[extension]'`. Only `feather` and `CSV` supported for now.
+  * You can also change the extension `lightcurve_extension_='.[extension]'`. Only `feather` and `CSV` are supported for now.
+  * Single-threaded, the full download would take ~55 days. The script parallelizes the downloads across 15 processes; the process count is adjustable via the `Pool(15)` call.
 
 After done that, you just have to run the following command in the terminal:
 ```
diff --git a/all_lightcurves_downloader.py b/all_lightcurves_downloader.py
index 40772fc..682ba2d 100644
--- a/all_lightcurves_downloader.py
+++ b/all_lightcurves_downloader.py
@@ -1,10 +1,22 @@
+from multiprocessing import Pool
+
 from lightcurve_downloader import download_lightcurve
 from merida.lightcurves_cls import Metadata
 from tqdm import tqdm
 
-metadata = Metadata()
-df = metadata.dataframe
 
-for lightcurve_name in tqdm(df['lightcurve_name']):
-    print('\n', lightcurve_name, '\n')
-    download_lightcurve(lightcurve_name, '', path_to_save_='data/microlensing_2M/', lightcurve_extension_='.feather')
+def process_light_curve_name(light_curve_name):
+    print('\n', light_curve_name, '\n')
+    download_lightcurve(light_curve_name, '', path_to_save_='data/microlensing_2M/', lightcurve_extension_='.feather')
+
+
+def download_all_light_curves():
+    metadata = Metadata()
+    df = metadata.dataframe
+    with Pool(15) as pool:
+        for _ in tqdm(pool.imap_unordered(process_light_curve_name, df['lightcurve_name'])):
+            pass
+
+
+if __name__ == '__main__':
+    download_all_light_curves()
diff --git a/data/metadata_1of2.feather b/data/metadata_1of2.feather
deleted file mode 100644
index 8dbddc2..0000000
Binary files a/data/metadata_1of2.feather and /dev/null differ
diff --git a/data/metadata_2of2.feather b/data/metadata_2of2.feather
deleted file mode 100644
index 85d4c7e..0000000
Binary files a/data/metadata_2of2.feather and /dev/null differ
diff --git a/pyproject.toml b/pyproject.toml
index f0c4350..37d81ad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
     "tqdm>=4.66.3",
     "pytest>=7.1.3",
     "pytest-pycharm>=0.7.0",
+    "astropy>=6.0.0",
 ]
 
 [project.urls]
diff --git a/src/merida/lightcurves_cls.py b/src/merida/lightcurves_cls.py
index 6b1cdfa..6fc4266 100644
--- a/src/merida/lightcurves_cls.py
+++ b/src/merida/lightcurves_cls.py
@@ -1,5 +1,9 @@
 import re
+import shutil
+import urllib.request
+from pathlib import Path
 
+import astropy.io.ascii
 import pandas as pd
 import numpy as np
 import requests
@@ -215,16 +219,48 @@ class Metadata:
         'lightcurve_name']
     """
 
-    def __init__(self):
-        self.path1 = 'data/metadata_1of2.feather'
-        self.path2 = 'data/metadata_2of2.feather'
-        self.dataframe1 = pd.read_feather(self.path1)
-        self.dataframe2 = pd.read_feather(self.path2)
-        self.dataframe = pd.concat([self.dataframe1, self.dataframe2], ignore_index=True)
+    def __init__(self, path_to_metadata: Path = Path('data/metadata.feather')):
+        if not path_to_metadata.exists():
+            print(f'Metadata not found at `{path_to_metadata}`. Downloading and converting...')
+            self.download_metadata(path_to_metadata)
+            print('Metadata download and conversion complete.')
+        self.dataframe = pd.read_feather(path_to_metadata)
 
     def get_one_metadata_csv_file(self):
         self.dataframe.to_csv('data/metadata_test.csv', index=False)
 
+    @staticmethod
+    def download_metadata(path_to_metadata: Path = Path('data/metadata.feather')) -> None:
+        """
+        Downloads the metadata for the MOA 9-year dataset.
+
+        :param path_to_metadata: The path to the metadata.
+        """
+        metadata_url = 'https://exoplanetarchive.ipac.caltech.edu/data/Contributed/MOA/bulk/metadata.ipac.tar.gz'
+        data_directory = path_to_metadata.parent
+        data_directory.mkdir(parents=True, exist_ok=True)
+        gz_path = data_directory.joinpath('metadata.ipac.tar.gz')
+        ipac_path = data_directory.joinpath('metadata.ipac')
+        if gz_path.exists():
+            gz_path.unlink()
+        if ipac_path.exists():
+            ipac_path.unlink()
+        if path_to_metadata.exists():
+            path_to_metadata.unlink()
+        with urllib.request.urlopen(metadata_url) as response, gz_path.open('wb') as gz_file:
+            shutil.copyfileobj(response, gz_file)
+        shutil.unpack_archive(gz_path, data_directory)
+        astropy_table = astropy.io.ascii.read(ipac_path)
+        data_frame = astropy_table.to_pandas()
+
+        def metadata_row_to_light_curve_name(metadata_row: pd.Series) -> str:
+            return f'gb{metadata_row["field"]}-R-{metadata_row["chip"]}-{metadata_row["subframe"]}-{metadata_row["id"]}'
+
+        data_frame['lightcurve_name'] = data_frame.apply(metadata_row_to_light_curve_name, axis=1)
+        data_frame.to_feather(path_to_metadata)
+        gz_path.unlink()
+        ipac_path.unlink()
+
 
 class MetadataLocal:
     """
diff --git a/src/merida/metadata_cls.py b/src/merida/metadata_cls.py
index 9a20dc3..19648a1 100644
--- a/src/merida/metadata_cls.py
+++ b/src/merida/metadata_cls.py
@@ -106,8 +106,6 @@ def __init__(self, internal_lightcurve_name, *,
         self.alert_id2 = self.metadata['alert_id2'].values[0]
         self.alert_x2 = self.metadata['alert_x2'].values[0]
         self.alert_y2 = self.metadata['alert_y2'].values[0]
-        self.ROW_IDX = self.metadata['ROW_IDX'].values[0]
-        self.ROW_NUM = self.metadata['ROW_NUM'].values[0]
         self.lightcurve_name = self.metadata['lightcurve_name'].values[0]
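
Reviewer note: with this change, `Metadata` bootstraps itself. On first use it downloads `metadata.ipac.tar.gz` from the NExSci archive, converts the IPAC table to feather, derives the `lightcurve_name` column (`gb{field}-R-{chip}-{subframe}-{id}`), and caches the result at `data/metadata.feather`; later instantiations read the cached file directly. A minimal usage sketch of the new constructor follows (the `head()` inspection is illustrative, not part of this diff):

```python
from pathlib import Path

from merida.lightcurves_cls import Metadata

# First call triggers download_metadata(); subsequent calls read the cached feather file.
metadata = Metadata(path_to_metadata=Path('data/metadata.feather'))

# The 'lightcurve_name' column is built from the 'field', 'chip', 'subframe', and 'id' columns.
print(metadata.dataframe['lightcurve_name'].head())
```

Note also that the `if __name__ == '__main__':` guard added to `all_lightcurves_downloader.py` is not cosmetic: under the spawn start method, `multiprocessing.Pool` re-imports the module in each worker, so the entry point must be guarded to avoid recursive process creation.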