3 changes: 3 additions & 0 deletions .gitignore
@@ -307,3 +307,6 @@ hs_err_pid*
# Project specific

/.idea/
/data/metadata.ipac
Contributor Author: No need to track these data files in git.

/data/metadata.ipac.tar.gz
/data/metadata.feather
11 changes: 2 additions & 9 deletions README.md
@@ -34,17 +34,10 @@ python lightcurve_downloader.py
```
---
### To locally download ALL MOA9yr lightcurves from the NExSci archive [can be improved]:
* This will take ~55 days... For now, I recommend breaking this function into 15 pieces and running them in parallel, e.g. piece #2:
```
metadata = Metadata()
n = 2
df_total = metadata.dataframe
df_temp = df_total[df_total['ROW_NUM'] > (n-1)*160604]
df = df_temp[df_temp['ROW_NUM'] <= n*160604]
```
* In `all_lightcurves_downloader.py`:
* You can change the path by changing the variable `path_to_save_='[the_path_you_want]/'`
* You can also change the extension `lightcurve_extension_='.[extension]'`. Only `feather` and `CSV` are supported for now.
* Single-threaded, this would take ~55 days. The script parallelizes the download across 15 processes, and this count is adjustable (see the sketch below).
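
A minimal sketch of the parallel fan-out, with the process count pulled out as a knob (`process_count` and `download_all` are hypothetical names; the script itself hard-codes `Pool(15)`):

```
from multiprocessing import Pool

from tqdm import tqdm

process_count = 15  # hypothetical knob; the script hard-codes Pool(15)


def download_all(names, worker):
    # Fan the lightcurve names out across worker processes and tick a
    # progress bar as each download finishes, in whatever order.
    # `worker` must be a top-level function so multiprocessing can pickle it.
    with Pool(process_count) as pool:
        for _ in tqdm(pool.imap_unordered(worker, names), total=len(names)):
            pass
```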

After that is done, just run the following command in the terminal:
```
…
```
22 changes: 17 additions & 5 deletions all_lightcurves_downloader.py
@@ -1,10 +1,22 @@
from multiprocessing import Pool

from lightcurve_downloader import download_lightcurve
from merida.lightcurves_cls import Metadata
from tqdm import tqdm

metadata = Metadata()
df = metadata.dataframe

for lightcurve_name in tqdm(df['lightcurve_name']):
print('\n', lightcurve_name, '\n')
download_lightcurve(lightcurve_name, '', path_to_save_='data/microlensing_2M/', lightcurve_extension_='.feather')
def process_light_curve_name(light_curve_name):
print('\n', light_curve_name, '\n')
download_lightcurve(light_curve_name, '', path_to_save_='data/microlensing_2M/', lightcurve_extension_='.feather')


def download_all_light_curves():
metadata = Metadata()
df = metadata.dataframe
with Pool(15) as pool:
for _ in tqdm(pool.imap_unordered(process_light_curve_name, df['lightcurve_name'])):
pass


if __name__ == '__main__':
download_all_light_curves()
Binary file removed data/metadata_1of2.feather
Binary file removed data/metadata_2of2.feather
1 change: 1 addition & 0 deletions pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
"tqdm>=4.66.3",
"pytest>=7.1.3",
"pytest-pycharm>=0.7.0",
"astropy>=6.0.0",
@golmschenk (Contributor Author), Apr 27, 2025:
Added astropy because it includes a reader for the IPAC data format. This makes it easy to convert that format to a pandas DataFrame, and from there to `.feather`.
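
A minimal sketch of that conversion path (the paths are illustrative; the calls are standard astropy and pandas APIs, and `to_feather` requires pyarrow):

```
import astropy.io.ascii

# Read the IPAC-format table; astropy's ascii reader detects the format.
table = astropy.io.ascii.read('data/metadata.ipac')

# Convert to pandas, then serialize as Feather.
data_frame = table.to_pandas()
data_frame.to_feather('data/metadata.feather')
```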

]

[project.urls]
…
48 changes: 42 additions & 6 deletions src/merida/lightcurves_cls.py
@@ -1,5 +1,9 @@
import re
import shutil
import urllib.request
from pathlib import Path

import astropy.io.ascii
import pandas as pd
import numpy as np
import requests
@@ -215,16 +219,48 @@ class Metadata:
'lightcurve_name']
"""

def __init__(self):
self.path1 = 'data/metadata_1of2.feather'
self.path2 = 'data/metadata_2of2.feather'
self.dataframe1 = pd.read_feather(self.path1)
self.dataframe2 = pd.read_feather(self.path2)
self.dataframe = pd.concat([self.dataframe1, self.dataframe2], ignore_index=True)
def __init__(self, path_to_metadata: Path = Path('data/metadata.feather')):
if not path_to_metadata.exists():
print(f'Metadata not found at `{path_to_metadata}`. Downloading and converting...')
self.download_metadata(path_to_metadata)
print(f'Metadata download and conversion complete.')
self.dataframe = pd.read_feather(path_to_metadata)

def get_one_metadata_csv_file(self):
self.dataframe.to_csv('data/metadata_test.csv', index=False)

@staticmethod
def download_metadata(path_to_metadata: Path = Path('data/metadata.feather')) -> None:
"""
Downloads the metadata for the MOA 9-year dataset and converts it to Feather format.

:param path_to_metadata: The path at which the converted metadata Feather file is saved.
"""
metadata_url = 'https://exoplanetarchive.ipac.caltech.edu/data/Contributed/MOA/bulk/metadata.ipac.tar.gz'
data_directory = path_to_metadata.parent
data_directory.mkdir(parents=True, exist_ok=True)
gz_path = data_directory.joinpath('metadata.ipac.tar.gz')
ipac_path = data_directory.joinpath('metadata.ipac')
if gz_path.exists():
gz_path.unlink()
if ipac_path.exists():
ipac_path.unlink()
if path_to_metadata.exists():
path_to_metadata.unlink()
with urllib.request.urlopen(metadata_url) as response, gz_path.open('wb') as gz_file:
shutil.copyfileobj(response, gz_file)
shutil.unpack_archive(gz_path, data_directory)
astropy_table = astropy.io.ascii.read(ipac_path)
data_frame = astropy_table.to_pandas()

def metadata_row_to_light_curve_name(metadata_row: pd.Series) -> str:
return f'gb{metadata_row["field"]}-R-{metadata_row["chip"]}-{metadata_row["subframe"]}-{metadata_row["id"]}'

data_frame['lightcurve_name'] = data_frame.apply(metadata_row_to_light_curve_name, axis=1)
data_frame.to_feather(path_to_metadata)
gz_path.unlink()
ipac_path.unlink()


class MetadataLocal:
"""
…
2 changes: 0 additions & 2 deletions src/merida/metadata_cls.py
@@ -106,8 +106,6 @@ def __init__(self, internal_lightcurve_name, *,
self.alert_id2 = self.metadata['alert_id2'].values[0]
self.alert_x2 = self.metadata['alert_x2'].values[0]
self.alert_y2 = self.metadata['alert_y2'].values[0]
self.ROW_IDX = self.metadata['ROW_IDX'].values[0]
Contributor Author: These seem to be metadata fields that don't exist in the IPAC version and just increment with the light curves. I'm guessing they were an artifact added during the original metadata file creation. I don't believe they are needed.
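
A quick sketch of how one might verify that suspicion, assuming 1-based numbering as the old README's `ROW_NUM` slicing suggests (run against one of the old metadata files, which still carry the column):

```
import numpy as np
import pandas as pd

# Hypothetical sanity check: is ROW_NUM just a 1..N row counter?
df = pd.read_feather('data/metadata_1of2.feather')
row_numbers = df['ROW_NUM'].to_numpy()
print((row_numbers == np.arange(1, len(df) + 1)).all())
```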

self.ROW_NUM = self.metadata['ROW_NUM'].values[0]
self.lightcurve_name = self.metadata['lightcurve_name'].values[0]


…