3 changes: 3 additions & 0 deletions .gitignore
@@ -307,3 +307,6 @@ hs_err_pid*
# Project specific

/.idea/
/data/metadata.ipac
Contributor Author: No need to track these data files in git.

/data/metadata.ipac.tar.gz
/data/metadata.feather
11 changes: 2 additions & 9 deletions README.md
@@ -34,17 +34,10 @@ python lightcurve_downloader.py
```
---
### To locally download ALL MOA9yr lightcurves from the NExSci archive [can be improved]:
* This will take ~55 days... For now, I recommend breaking this function into 15 pieces and running them in parallel, e.g. piece #2:
```
metadata = Metadata()
n = 2
df_total = metadata.dataframe
df_temp = df_total[df_total['ROW_NUM'] > (n-1)*160604]
df = df_temp[df_temp['ROW_NUM'] <= n*160604]
```
* In `all_lightcurves_downloader.py`:
* You can change the path by changing the variable `path_to_save_='[the_path_you_want]/'`
* You can also change the extension `lightcurve_extension_='.[extension]'`. Only `feather` and `CSV` are supported for now.
* Single-threaded, this would take ~55 days. The script parallelizes the download across 15 processes, and this count is adjustable (see the sketch below).
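
A minimal sketch of the parallel fan-out, with the process count pulled out as a knob (`process_count` and `download_all` are hypothetical names; the script itself hard-codes `Pool(15)`):

```
from multiprocessing import Pool

from tqdm import tqdm

process_count = 15  # hypothetical knob; the script hard-codes Pool(15)


def download_all(names, worker):
    # Fan the lightcurve names out across worker processes and tick a
    # progress bar as each download finishes, in whatever order.
    # `worker` must be a top-level function so multiprocessing can pickle it.
    with Pool(process_count) as pool:
        for _ in tqdm(pool.imap_unordered(worker, names), total=len(names)):
            pass
```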

After that is done, just run the following command in the terminal:
```
…
```
22 changes: 17 additions & 5 deletions all_lightcurves_downloader.py
@@ -1,10 +1,22 @@
from multiprocessing import Pool

from lightcurve_downloader import download_lightcurve
from merida.lightcurves_cls import Metadata
from tqdm import tqdm

metadata = Metadata()
df = metadata.dataframe

for lightcurve_name in tqdm(df['lightcurve_name']):
print('\n', lightcurve_name, '\n')
download_lightcurve(lightcurve_name, '', path_to_save_='data/microlensing_2M/', lightcurve_extension_='.feather')
def process_light_curve_name(light_curve_name):
print('\n', light_curve_name, '\n')
download_lightcurve(light_curve_name, '', path_to_save_='data/microlensing_2M/', lightcurve_extension_='.feather')


def download_all_light_curves():
metadata = Metadata()
df = metadata.dataframe
with Pool(15) as pool:
for _ in tqdm(pool.imap_unordered(process_light_curve_name, df['lightcurve_name'])):
pass


if __name__ == '__main__':
download_all_light_curves()
Binary file removed data/metadata_1of2.feather
Binary file removed data/metadata_2of2.feather
1 change: 1 addition & 0 deletions pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
"tqdm>=4.66.3",
"pytest>=7.1.3",
"pytest-pycharm>=0.7.0",
"astropy>=6.0.0",
@golmschenk (Contributor Author), Apr 27, 2025:
Added astropy because it includes a reader for the IPAC data format. This makes it easy to convert that format to a pandas DataFrame, and from there to `.feather`.
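
A minimal sketch of that conversion path (the paths are illustrative; the calls are standard astropy and pandas APIs, and `to_feather` requires pyarrow):

```
import astropy.io.ascii

# Read the IPAC-format table; astropy's ascii reader detects the format.
table = astropy.io.ascii.read('data/metadata.ipac')

# Convert to pandas, then serialize as Feather.
data_frame = table.to_pandas()
data_frame.to_feather('data/metadata.feather')
```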

]

[project.urls]
…
48 changes: 42 additions & 6 deletions src/merida/lightcurves_cls.py
@@ -1,5 +1,9 @@
import re
import shutil
import urllib.request
from pathlib import Path

import astropy.io.ascii
import pandas as pd
import numpy as np
import requests
@@ -215,16 +219,48 @@ class Metadata:
'lightcurve_name']
"""

def __init__(self):
self.path1 = 'data/metadata_1of2.feather'
self.path2 = 'data/metadata_2of2.feather'
self.dataframe1 = pd.read_feather(self.path1)
self.dataframe2 = pd.read_feather(self.path2)
self.dataframe = pd.concat([self.dataframe1, self.dataframe2], ignore_index=True)
def __init__(self, path_to_metadata: Path = Path('data/metadata.feather')):
if not path_to_metadata.exists():
print(f'Metadata not found at `{path_to_metadata}`. Downloading and converting...')
self.download_metadata(path_to_metadata)
print(f'Metadata download and conversion complete.')
self.dataframe = pd.read_feather(path_to_metadata)

def get_one_metadata_csv_file(self):
self.dataframe.to_csv('data/metadata_test.csv', index=False)

@staticmethod
def download_metadata(path_to_metadata: Path = Path('data/metadata.feather')) -> None:
"""
Downloads the metadata for the MOA 9-year dataset and converts it to Feather format.

:param path_to_metadata: The path at which the converted metadata Feather file is saved.
"""
metadata_url = 'https://exoplanetarchive.ipac.caltech.edu/data/Contributed/MOA/bulk/metadata.ipac.tar.gz'
data_directory = path_to_metadata.parent
data_directory.mkdir(parents=True, exist_ok=True)
gz_path = data_directory.joinpath('metadata.ipac.tar.gz')
ipac_path = data_directory.joinpath('metadata.ipac')
if gz_path.exists():
gz_path.unlink()
if ipac_path.exists():
ipac_path.unlink()
if path_to_metadata.exists():
path_to_metadata.unlink()
with urllib.request.urlopen(metadata_url) as response, gz_path.open('wb') as gz_file:
shutil.copyfileobj(response, gz_file)
shutil.unpack_archive(gz_path, data_directory)
astropy_table = astropy.io.ascii.read(ipac_path)
data_frame = astropy_table.to_pandas()

def metadata_row_to_light_curve_name(metadata_row: pd.Series) -> str:
return f'gb{metadata_row["field"]}-R-{metadata_row["chip"]}-{metadata_row["subframe"]}-{metadata_row["id"]}'

data_frame['lightcurve_name'] = data_frame.apply(metadata_row_to_light_curve_name, axis=1)
data_frame.to_feather(path_to_metadata)
gz_path.unlink()
ipac_path.unlink()


class MetadataLocal:
"""
…
2 changes: 0 additions & 2 deletions src/merida/metadata_cls.py
@@ -106,8 +106,6 @@ def __init__(self, internal_lightcurve_name, *,
self.alert_id2 = self.metadata['alert_id2'].values[0]
self.alert_x2 = self.metadata['alert_x2'].values[0]
self.alert_y2 = self.metadata['alert_y2'].values[0]
self.ROW_IDX = self.metadata['ROW_IDX'].values[0]
Contributor Author: These seem to be metadata fields that don't exist in the IPAC version and just increment with the light curves. I'm guessing they were an artifact added during the original metadata file creation. I don't believe they are needed.
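
A quick sketch of how one might verify that suspicion, assuming 1-based numbering as the old README's `ROW_NUM` slicing suggests (run against one of the old metadata files, which still carry the column):

```
import numpy as np
import pandas as pd

# Hypothetical sanity check: is ROW_NUM just a 1..N row counter?
df = pd.read_feather('data/metadata_1of2.feather')
row_numbers = df['ROW_NUM'].to_numpy()
print((row_numbers == np.arange(1, len(df) + 1)).all())
```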

self.ROW_NUM = self.metadata['ROW_NUM'].values[0]
self.lightcurve_name = self.metadata['lightcurve_name'].values[0]


…