Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -657,6 +657,17 @@ def cmd_stats(args: argparse.Namespace) -> None:
table.add_row(prog_id, str(acc), str(rej), str(wl), str(total), rate)

console.print(table)

# Gender & nationality summary
gender = summary.get("gender_dist", {})
nat = summary.get("nationality_dist", {})
if gender or nat:
gender_str = f"M:{gender.get('M', 0)} F:{gender.get('F', 0)}"
nat_parts = [f"{k}:{v}" for k, v in sorted(nat.items(), key=lambda x: -x[1])]
console.print(
f" [bold]Demographics:[/bold] Gender: {gender_str} | "
f"Nationality: {', '.join(nat_parts)}"
)
console.print()

# GPA distribution
Expand Down
111 changes: 109 additions & 2 deletions core/admission_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@

CSV schema
----------
id, bg_type, gpa, gpa_scale, gre, toefl, major, intern_desc,
has_paper, has_research, courses_note, program, result, season, source
id, gender, bg_type, nationality, gpa, gpa_scale, gre, toefl, major,
intern_desc, has_paper, has_research, courses_note, program, result,
season, source
"""

from __future__ import annotations
Expand Down Expand Up @@ -165,6 +166,60 @@ def classify_background(bg_type: str) -> int:
return 4 # default


# ---------------------------------------------------------------------------
# Nationality classification
# ---------------------------------------------------------------------------

# Canonical nationality values
NATIONALITY_DOMESTIC = "domestic"      # US citizen / permanent resident
NATIONALITY_CHINA = "china"            # Chinese mainland
NATIONALITY_HK_TW = "hk_tw"            # Hong Kong, Macau, Taiwan
NATIONALITY_OTHER_INTL = "other_intl"  # Other international

# Alias -> canonical value. Keys are lowercase with all whitespace removed,
# which is the same normalization classify_nationality() applies to its input.
_NATIONALITY_MAP: dict[str, str] = {
    "美籍": NATIONALITY_DOMESTIC,
    "美国": NATIONALITY_DOMESTIC,
    "us": NATIONALITY_DOMESTIC,
    "usa": NATIONALITY_DOMESTIC,
    "domestic": NATIONALITY_DOMESTIC,
    "greencard": NATIONALITY_DOMESTIC,
    "绿卡": NATIONALITY_DOMESTIC,
    "pr": NATIONALITY_DOMESTIC,
    "中国大陆": NATIONALITY_CHINA,
    "中国": NATIONALITY_CHINA,
    "大陆": NATIONALITY_CHINA,
    "china": NATIONALITY_CHINA,
    "mainland": NATIONALITY_CHINA,
    "prc": NATIONALITY_CHINA,
    "港澳台": NATIONALITY_HK_TW,
    "香港": NATIONALITY_HK_TW,
    "台湾": NATIONALITY_HK_TW,
    "澳门": NATIONALITY_HK_TW,
    "hk": NATIONALITY_HK_TW,
    "hongkong": NATIONALITY_HK_TW,
    "macau": NATIONALITY_HK_TW,
    "macao": NATIONALITY_HK_TW,
    "taiwan": NATIONALITY_HK_TW,
    "tw": NATIONALITY_HK_TW,
}

# Normalized values that mean "not reported".
_NATIONALITY_UNKNOWN = frozenset({"不明", "n/a", "unknown"})

# Order in which canonical groups are tried during partial matching.
# HK/TW must be tried before mainland China because "中国" is a substring of
# "中国香港" / "中国台湾"; domestic stays first, as in the original map order.
_NATIONALITY_PARTIAL_ORDER = (
    NATIONALITY_DOMESTIC,
    NATIONALITY_HK_TW,
    NATIONALITY_CHINA,
)

# ASCII aliases shorter than this ("us", "pr", "hk", ...) occur inside many
# unrelated country names ("Russia", "Cyprus"), so they may only match a whole
# word, never an arbitrary substring.
_NATIONALITY_MIN_SUBSTRING_LEN = 4


def classify_nationality(nationality: str) -> str:
    """Map a free-form nationality string to a canonical value.

    Matching is case-insensitive and ignores whitespace. Resolution order:

    1. Empty / "not reported" values return ``'china'`` (the most common
       group in the MFE applicant pool).
    2. An exact alias match in ``_NATIONALITY_MAP``.
    3. A partial match, trying domestic, then HK/TW, then China aliases:
       short ASCII aliases must appear as a standalone word; longer ASCII
       aliases and CJK aliases may appear anywhere in the value. A CJK value
       that is itself a fragment of an alias (e.g. "港澳") also matches.
    4. Anything else is ``'other_intl'``.

    Args:
        nationality: Raw nationality text; ``None`` is treated as empty.

    Returns:
        One of ``'domestic'``, ``'china'``, ``'hk_tw'``, ``'other_intl'``.
    """
    lowered = (nationality or "").strip().lower()
    # Drop every whitespace character (including full-width spaces).
    val = "".join(lowered.split())
    if not val or val in _NATIONALITY_UNKNOWN:
        return NATIONALITY_CHINA  # default for MFE applicant pool

    # Exact match
    if val in _NATIONALITY_MAP:
        return _NATIONALITY_MAP[val]

    # ASCII words of the input, split on anything that is not an ASCII
    # letter or digit, so "US citizen" yields {"us", "citizen"} while
    # "Russia" yields only {"russia"}.
    words = set(
        "".join(ch if ch.isascii() and ch.isalnum() else " " for ch in lowered).split()
    )
    # Abbreviation matching (value is a fragment of an alias) is only
    # meaningful for CJK input; for ASCII it matches nearly any short string.
    allow_fragment = not val.isascii()

    # Partial match
    for canonical in _NATIONALITY_PARTIAL_ORDER:
        for key, target in _NATIONALITY_MAP.items():
            if target != canonical:
                continue
            if key.isascii() and len(key) < _NATIONALITY_MIN_SUBSTRING_LEN:
                if key in words:
                    return canonical
            elif key in val or (allow_fragment and val in key):
                return canonical

    return NATIONALITY_OTHER_INTL


# ---------------------------------------------------------------------------
# Intern strength scoring
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -230,8 +285,11 @@ class AdmissionRecord:
"""A single real applicant data point with normalized fields."""

id: str = ""
gender: str = "" # M / F / empty
bg_type: str = ""
bg_tier: int = 4 # 1-5, computed from bg_type
nationality: str = "" # raw value
nationality_canonical: str = "" # domestic / china / hk_tw / other_intl
gpa_raw: float = 0.0
gpa_scale: float = 4.0
gpa_normalized: float = 0.0 # on 4.0 scale
Expand Down Expand Up @@ -266,6 +324,8 @@ class ProgramStats:
avg_intern_score_accepted: float = 0.0
paper_rate_accepted: float = 0.0
research_rate_accepted: float = 0.0
female_rate_accepted: float = 0.0 # fraction of female among accepted
nationality_dist_accepted: dict[str, int] = field(default_factory=dict)

# Rejected applicant stats
avg_gpa_rejected: float = 0.0
Expand Down Expand Up @@ -348,11 +408,15 @@ def load_admission_csv(path: str | Path) -> list[AdmissionRecord]:
gpa_scale = 4.0

bg_type = row.get("bg_type", "").strip()
nationality_raw = row.get("nationality", "").strip()

rec = AdmissionRecord(
id=row.get("id", "").strip(),
gender=row.get("gender", "").strip().upper(),
bg_type=bg_type,
bg_tier=classify_background(bg_type),
nationality=nationality_raw,
nationality_canonical=classify_nationality(nationality_raw),
gpa_raw=gpa_raw,
gpa_scale=gpa_scale,
gpa_normalized=normalize_gpa(gpa_raw, gpa_scale),
Expand Down Expand Up @@ -454,6 +518,19 @@ def compute_program_stats(
if research_known
else 0.0
)
# Gender stats
gendered = [r for r in accepted if r.gender in ("M", "F")]
stats.female_rate_accepted = (
sum(1 for r in gendered if r.gender == "F") / len(gendered)
if gendered
else 0.0
)
# Nationality distribution
nat_dist: dict[str, int] = {}
for r in accepted:
nat = r.nationality_canonical or "unknown"
nat_dist[nat] = nat_dist.get(nat, 0) + 1
stats.nationality_dist_accepted = nat_dist

# Rejected stats
if rejected:
Expand Down Expand Up @@ -530,6 +607,22 @@ def _effect_size(acc_vals: list[float], rej_vals: list[float]) -> float:
[1.0 if r.has_research else 0.0 for r in rejected if r.has_research is not None],
)

# Gender (female = 1, male = 0)
acc_gender = [1.0 if r.gender == "F" else 0.0 for r in accepted if r.gender in ("M", "F")]
rej_gender = [1.0 if r.gender == "F" else 0.0 for r in rejected if r.gender in ("M", "F")]
features["gender_f"] = _effect_size(acc_gender, rej_gender)

# Nationality (domestic = 1, international = 0)
acc_nat = [
1.0 if r.nationality_canonical == "domestic" else 0.0
for r in accepted if r.nationality_canonical
]
rej_nat = [
1.0 if r.nationality_canonical == "domestic" else 0.0
for r in rejected if r.nationality_canonical
]
features["domestic"] = _effect_size(acc_nat, rej_nat)

return features


Expand Down Expand Up @@ -564,6 +657,18 @@ def summarize_records(records: list[AdmissionRecord]) -> dict[str, Any]:
seasons = sorted({r.season for r in records if r.season})
sources = sorted({r.source for r in records if r.source})

# Gender breakdown
gendered = [r for r in records if r.gender in ("M", "F")]
gender_dist = {"M": 0, "F": 0}
for r in gendered:
gender_dist[r.gender] += 1

# Nationality breakdown
nat_dist: dict[str, int] = {}
for r in records:
nat = r.nationality_canonical or "unknown"
nat_dist[nat] = nat_dist.get(nat, 0) + 1

return {
"total_records": len(records),
"unique_applicants": len({r.id for r in records}),
Expand All @@ -572,4 +677,6 @@ def summarize_records(records: list[AdmissionRecord]) -> dict[str, Any]:
"sources": sources,
"avg_gpa_normalized": _safe_avg([r.gpa_normalized for r in records]),
"gre_available": sum(1 for r in records if r.gre is not None),
"gender_dist": gender_dist,
"nationality_dist": nat_dist,
}
39 changes: 31 additions & 8 deletions core/calibrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,28 +209,28 @@ def predict_outcome(
score = 0.0
max_score = 0.0

# GPA component (40%)
weight_gpa = 0.4
# GPA component (35%)
weight_gpa = 0.35
max_score += weight_gpa
if threshold.gpa_target > 0:
gpa_ratio = record.gpa_normalized / threshold.gpa_target
score += weight_gpa * min(1.0, gpa_ratio)

# Background tier (25%)
weight_bg = 0.25
# Background tier (20%)
weight_bg = 0.20
max_score += weight_bg
if threshold.max_bg_tier_accepted > 0:
bg_ratio = 1.0 - (record.bg_tier - 1) / 4.0 # tier 1=1.0, tier 5=0.0
score += weight_bg * max(0.0, bg_ratio)

# Intern score (20%)
weight_intern = 0.2
# Intern score (18%)
weight_intern = 0.18
max_score += weight_intern
if record.intern_score > 0:
score += weight_intern * min(1.0, record.intern_score / 8.0)

# Research/paper bonus (15%)
weight_research = 0.15
# Research/paper bonus (12%)
weight_research = 0.12
max_score += weight_research
bonus = 0.0
if record.has_paper:
Expand All @@ -239,6 +239,29 @@ def predict_outcome(
bonus += 0.5
score += weight_research * bonus

# Gender diversity bonus (7%)
# MFE programs skew heavily male; female applicants may benefit
weight_gender = 0.07
max_score += weight_gender
if record.gender == "F":
score += weight_gender * 1.0
elif record.gender == "M":
score += weight_gender * 0.4 # baseline, no penalty

# Nationality / domestic advantage (8%)
# Domestic applicants (US citizens/PR) have slight advantage
weight_nat = 0.08
max_score += weight_nat
nat = record.nationality_canonical
if nat == "domestic":
score += weight_nat * 1.0
elif nat == "hk_tw":
score += weight_nat * 0.6
elif nat == "china":
score += weight_nat * 0.4 # largest applicant pool, most competitive
else:
score += weight_nat * 0.5

# Classify based on score ratio
ratio = score / max_score if max_score > 0 else 0.0

Expand Down
62 changes: 31 additions & 31 deletions data/admissions/sample.csv
Original file line number Diff line number Diff line change
@@ -1,31 +1,31 @@
id,bg_type,gpa,gpa_scale,gre,toefl,major,intern_desc,has_paper,has_research,courses_note,program,result,season,source
1,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,baruch-mfe,accepted,2025Fall,quantnet
2,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,princeton-mfin,rejected,2025Fall,quantnet
3,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,cmu-mscf,accepted,2025Fall,quantnet
4,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,mit-mfin,rejected,2025Fall,quantnet
5,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,uchicago-msfm,accepted,2025Fall,quantnet
6,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,gatech-qcf,accepted,2025Fall,quantnet
7,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,nus-qf,accepted,2025Fall,quantnet
8,985,3.8,4,332,112,数学,2段量化实习+1段券商研究,是,是,实分析+随机过程+C++,baruch-mfe,accepted,2025Fall,chasedream
9,985,3.8,4,332,112,数学,2段量化实习+1段券商研究,是,是,实分析+随机过程+C++,princeton-mfin,rejected,2025Fall,chasedream
10,985,3.8,4,332,112,数学,2段量化实习+1段券商研究,是,是,实分析+随机过程+C++,cmu-mscf,accepted,2025Fall,chasedream
11,海本(Top30),3.92,4,335,,,2段顶级量化+1段投行,是,否,数学+CS双专业,baruch-mfe,accepted,2025Fall,linkedin
12,海本(Top30),3.92,4,335,,,2段顶级量化+1段投行,是,否,数学+CS双专业,princeton-mfin,accepted,2025Fall,linkedin
13,海本(Top30),3.92,4,335,,,2段顶级量化+1段投行,是,否,数学+CS双专业,cmu-mscf,accepted,2025Fall,linkedin
14,985,3.5,4,325,105,金融,1段银行实习,否,否,,columbia-mafn,rejected,2025Fall,chasedream
15,985,3.5,4,325,105,金融,1段银行实习,否,否,,gatech-qcf,accepted,2025Fall,chasedream
16,211,3.6,4,328,108,统计,2段数据分析实习,否,是,时间序列+回归分析,nyu-mfe,rejected,2025Fall,quantnet
17,211,3.6,4,328,108,统计,2段数据分析实习,否,是,时间序列+回归分析,rutgers-msmf,accepted,2025Fall,quantnet
18,211,3.6,4,328,108,统计,2段数据分析实习,否,是,时间序列+回归分析,fordham-msqf,accepted,2025Fall,quantnet
19,985,87,100,330,115,金工,3段量化+1段投行,是,是,随机微积分+实分析+ML,baruch-mfe,accepted,2025Fall,offershow
20,985,87,100,330,115,金工,3段量化+1段投行,是,是,随机微积分+实分析+ML,nyu-mfe,accepted,2025Fall,offershow
21,海本(Top50),3.7,4,329,,,1段量化实习+1段fintech,否,否,CS+数学辅修,columbia-mafn,accepted,2025Fall,offershow
22,海本(Top50),3.7,4,329,,,1段量化实习+1段fintech,否,否,CS+数学辅修,mit-mfin,rejected,2025Fall,offershow
23,双非一本,3.8,4,326,102,应用数学,1段量化实习,否,否,概率论+线代+微积分,rutgers-msmf,accepted,2025Fall,chasedream
24,双非一本,3.8,4,326,102,应用数学,1段量化实习,否,否,概率论+线代+微积分,baruch-mfe,rejected,2025Fall,chasedream
25,985,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,是,是,ML+深度学习+C++,baruch-mfe,accepted,2025Fall,quantnet
26,985,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,是,是,ML+深度学习+C++,cmu-mscf,accepted,2025Fall,quantnet
27,985,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,是,是,ML+深度学习+C++,princeton-mfin,waitlisted,2025Fall,quantnet
28,海本(Top10),3.95,4.3,337,,,数学+金融双专业,2段顶级投行+1段对冲基金,是,是,实分析+泛函+随机微积分,princeton-mfin,accepted,2025Fall,linkedin
29,海本(Top10),3.95,4.3,337,,,数学+金融双专业,2段顶级投行+1段对冲基金,是,是,实分析+泛函+随机微积分,baruch-mfe,accepted,2025Fall,linkedin
30,海本(Top10),3.95,4.3,337,,,数学+金融双专业,2段顶级投行+1段对冲基金,是,是,实分析+泛函+随机微积分,mit-mfin,accepted,2025Fall,linkedin
id,gender,bg_type,nationality,gpa,gpa_scale,gre,toefl,major,intern_desc,has_paper,has_research,courses_note,program,result,season,source
1,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,baruch-mfe,accepted,2025Fall,quantnet
2,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,princeton-mfin,rejected,2025Fall,quantnet
3,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,cmu-mscf,accepted,2025Fall,quantnet
4,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,mit-mfin,rejected,2025Fall,quantnet
5,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,uchicago-msfm,accepted,2025Fall,quantnet
6,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,gatech-qcf,accepted,2025Fall,quantnet
7,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,nus-qf,accepted,2025Fall,quantnet
8,M,985,中国大陆,3.8,4,332,112,数学,2段量化实习+1段券商研究,是,是,实分析+随机过程+C++,baruch-mfe,accepted,2025Fall,chasedream
9,M,985,中国大陆,3.8,4,332,112,数学,2段量化实习+1段券商研究,是,是,实分析+随机过程+C++,princeton-mfin,rejected,2025Fall,chasedream
10,M,985,中国大陆,3.8,4,332,112,数学,2段量化实习+1段券商研究,是,是,实分析+随机过程+C++,cmu-mscf,accepted,2025Fall,chasedream
11,F,海本(Top30),美籍,3.92,4,335,,,2段顶级量化+1段投行,是,否,数学+CS双专业,baruch-mfe,accepted,2025Fall,linkedin
12,F,海本(Top30),美籍,3.92,4,335,,,2段顶级量化+1段投行,是,否,数学+CS双专业,princeton-mfin,accepted,2025Fall,linkedin
13,F,海本(Top30),美籍,3.92,4,335,,,2段顶级量化+1段投行,是,否,数学+CS双专业,cmu-mscf,accepted,2025Fall,linkedin
14,M,985,中国大陆,3.5,4,325,105,金融,1段银行实习,否,否,,columbia-mafn,rejected,2025Fall,chasedream
15,M,985,中国大陆,3.5,4,325,105,金融,1段银行实习,否,否,,gatech-qcf,accepted,2025Fall,chasedream
16,F,211,中国大陆,3.6,4,328,108,统计,2段数据分析实习,否,是,时间序列+回归分析,nyu-mfe,rejected,2025Fall,quantnet
17,F,211,中国大陆,3.6,4,328,108,统计,2段数据分析实习,否,是,时间序列+回归分析,rutgers-msmf,accepted,2025Fall,quantnet
18,F,211,中国大陆,3.6,4,328,108,统计,2段数据分析实习,否,是,时间序列+回归分析,fordham-msqf,accepted,2025Fall,quantnet
19,M,985,中国大陆,87,100,330,115,金工,3段量化+1段投行,是,是,随机微积分+实分析+ML,baruch-mfe,accepted,2025Fall,offershow
20,M,985,中国大陆,87,100,330,115,金工,3段量化+1段投行,是,是,随机微积分+实分析+ML,nyu-mfe,accepted,2025Fall,offershow
21,M,海本(Top50),港澳台,3.7,4,329,,,1段量化实习+1段fintech,否,否,CS+数学辅修,columbia-mafn,accepted,2025Fall,offershow
22,M,海本(Top50),港澳台,3.7,4,329,,,1段量化实习+1段fintech,否,否,CS+数学辅修,mit-mfin,rejected,2025Fall,offershow
23,M,双非一本,中国大陆,3.8,4,326,102,应用数学,1段量化实习,否,否,概率论+线代+微积分,rutgers-msmf,accepted,2025Fall,chasedream
24,M,双非一本,中国大陆,3.8,4,326,102,应用数学,1段量化实习,否,否,概率论+线代+微积分,baruch-mfe,rejected,2025Fall,chasedream
25,M,985,中国大陆,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,是,是,ML+深度学习+C++,baruch-mfe,accepted,2025Fall,quantnet
26,M,985,中国大陆,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,是,是,ML+深度学习+C++,cmu-mscf,accepted,2025Fall,quantnet
27,M,985,中国大陆,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,是,是,ML+深度学习+C++,princeton-mfin,waitlisted,2025Fall,quantnet
28,F,海本(Top10),美籍,3.95,4.3,337,,数学+金融双专业,2段顶级投行+1段对冲基金,是,是,实分析+泛函+随机微积分,princeton-mfin,accepted,2025Fall,linkedin
29,F,海本(Top10),美籍,3.95,4.3,337,,数学+金融双专业,2段顶级投行+1段对冲基金,是,是,实分析+泛函+随机微积分,baruch-mfe,accepted,2025Fall,linkedin
30,F,海本(Top10),美籍,3.95,4.3,337,,数学+金融双专业,2段顶级投行+1段对冲基金,是,是,实分析+泛函+随机微积分,mit-mfin,accepted,2025Fall,linkedin
2 changes: 1 addition & 1 deletion data/admissions/template.csv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
id,bg_type,gpa,gpa_scale,gre,toefl,major,intern_desc,has_paper,has_research,courses_note,program,result,season,source
id,gender,bg_type,nationality,gpa,gpa_scale,gre,toefl,major,intern_desc,has_paper,has_research,courses_note,program,result,season,source
Loading
Loading