-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
38 lines (30 loc) · 1.26 KB
/
main.py
File metadata and controls
38 lines (30 loc) · 1.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from pathlib import Path
import pandas as pd
from encoder import EncoderConfig, TopologyBiasedGlycanEncoder
def main() -> None:
    """Print the research-proposal header followed by an encoder status report.

    The header (title and one-sentence thesis) is always printed. The status
    section depends on whether the tokenized glycan artifact exists at
    ``artifacts/glycans_tokenized.parquet``, resolved against the current
    working directory:

    * missing -- a single line naming the missing path is printed;
    * present -- the parquet file is loaded, an encoder is built from it with
      ``EncoderConfig(embedding_dim=64, random_seed=13)``, the frame is encoded
      with ``parseable_only=True``, and row counts, the embedding dimension
      and the vocabulary size are printed.
    """
    proposal_title = "Glyde: A domain-aware, topology-biased glycan language model for viral glycan binding prediction"
    proposal_thesis = "Viral glycan binding is a high-impact, tractable domain for a structure-aware glycan language model."

    # Empty entries produce the blank separator lines between sections.
    header_lines = (
        "Research Proposal",
        proposal_title,
        "",
        "One-sentence thesis:",
        proposal_thesis,
        "",
    )
    for header_line in header_lines:
        print(header_line)

    artifact = Path("artifacts/glycans_tokenized.parquet")
    if artifact.exists():
        glycans = pd.read_parquet(artifact)
        model = TopologyBiasedGlycanEncoder.from_tokenized_dataframe(
            glycans,
            config=EncoderConfig(embedding_dim=64, random_seed=13),
        )
        encoded = model.encode_dataframe(glycans, parseable_only=True)

        # The status header is emitted only once encoding has succeeded, so a
        # failure above leaves no partial report behind.
        print("Encoder status:")
        print(f"- tokenized glycans: {len(glycans)}")
        print(f"- encoded glycans: {len(encoded)}")
        print(f"- encoder dim: {model.config.embedding_dim}")
        print(f"- encoder vocab size: {len(model.token_to_id)}")
    else:
        print("Encoder status:")
        print(f"- tokenized artifact missing at {artifact}")
# Run the report only when this file is executed as a script, so importing
# the module does not trigger any printing or file access.
if __name__ == "__main__":
    main()