# ---------------------------------------------------------------------------
# Nationality classification
# ---------------------------------------------------------------------------

# Canonical nationality values
NATIONALITY_DOMESTIC = "domestic"  # US citizen / permanent resident
NATIONALITY_CHINA = "china"  # Chinese mainland
NATIONALITY_HK_TW = "hk_tw"  # Hong Kong, Macau, Taiwan
NATIONALITY_OTHER_INTL = "other_intl"  # Other international

# Alias -> canonical value. Keys are lower-case with spaces removed, because
# classify_nationality() normalizes its input the same way before lookup.
_NATIONALITY_MAP: dict[str, str] = {
    "美籍": NATIONALITY_DOMESTIC,
    "美国": NATIONALITY_DOMESTIC,
    "us": NATIONALITY_DOMESTIC,
    "usa": NATIONALITY_DOMESTIC,
    "domestic": NATIONALITY_DOMESTIC,
    "greencard": NATIONALITY_DOMESTIC,
    "绿卡": NATIONALITY_DOMESTIC,
    "pr": NATIONALITY_DOMESTIC,
    "中国大陆": NATIONALITY_CHINA,
    "中国": NATIONALITY_CHINA,
    "大陆": NATIONALITY_CHINA,
    "china": NATIONALITY_CHINA,
    "prc": NATIONALITY_CHINA,
    "mainland": NATIONALITY_CHINA,
    "港澳台": NATIONALITY_HK_TW,
    "港澳": NATIONALITY_HK_TW,
    "香港": NATIONALITY_HK_TW,
    "台湾": NATIONALITY_HK_TW,
    "澳门": NATIONALITY_HK_TW,
    "hk": NATIONALITY_HK_TW,
    "hongkong": NATIONALITY_HK_TW,
    "macau": NATIONALITY_HK_TW,
    "macao": NATIONALITY_HK_TW,
    "taiwan": NATIONALITY_HK_TW,
}

# Placeholder values that mean "nationality not stated".
_NATIONALITY_UNKNOWN = frozenset({"不明", "n/a", "unknown"})

# Tie-break order when a free-form value mentions several categories:
# residency status wins ("中国籍绿卡" -> domestic), and the more specific region
# beats the broader country name ("中国香港" -> hk_tw rather than china).
_NATIONALITY_MATCH_PRIORITY = (
    NATIONALITY_DOMESTIC,
    NATIONALITY_HK_TW,
    NATIONALITY_CHINA,
)


def _ascii_candidates(text: str) -> set[str]:
    """Return the whole-word strings in *text* that an ASCII alias may equal.

    Two kinds of candidates are produced:

    * every run of ASCII letters/digits ("p.r.china" -> "p", "r", "china");
    * every run of consecutive whitespace-separated words joined without
      spaces ("green card holder" -> "green", "greencard", ...), so that
      multi-word aliases stored without spaces still match.
    """
    alnum_only = "".join(
        ch if ch.isascii() and ch.isalnum() else " " for ch in text
    )
    candidates = set(alnum_only.split())

    words = text.split()
    for start in range(len(words)):
        joined = ""
        for word in words[start:]:
            joined += word
            candidates.add(joined)
    return candidates


def classify_nationality(nationality: str) -> str:
    """Map a nationality string to a canonical value.

    Matching is case-insensitive and ignores spaces. An exact alias match is
    tried first. Otherwise free-form text is scanned for aliases: ASCII
    aliases must appear as whole words (so "Russia" or "Australia" are not
    mistaken for "US"), while CJK aliases match as substrings because
    Chinese text has no word separators.

    Args:
        nationality: Raw nationality text, e.g. "美籍", "China", "green card".

    Returns:
        One of 'domestic', 'china', 'hk_tw', 'other_intl'.
        Empty/unknown values return 'china' (most common in MFE applicant
        pool). Values that match no alias return 'other_intl'.
    """
    text = (nationality or "").strip().lower()
    compact = text.replace(" ", "")
    if not compact or compact in _NATIONALITY_UNKNOWN:
        return NATIONALITY_CHINA  # default for MFE applicant pool

    # Exact match
    exact = _NATIONALITY_MAP.get(compact)
    if exact is not None:
        return exact

    # Partial match: alias contained in the input. The reverse direction
    # (input contained in an alias) is deliberately not used -- a fragment
    # such as "c" being part of "domestic" says nothing about nationality.
    ascii_candidates = _ascii_candidates(text)
    matched = {
        canonical
        for alias, canonical in _NATIONALITY_MAP.items()
        if (alias in ascii_candidates if alias.isascii() else alias in compact)
    }
    for canonical in _NATIONALITY_MATCH_PRIORITY:
        if canonical in matched:
            return canonical

    return NATIONALITY_OTHER_INTL
str | Path) -> list[AdmissionRecord]: gpa_scale = 4.0 bg_type = row.get("bg_type", "").strip() + nationality_raw = row.get("nationality", "").strip() rec = AdmissionRecord( id=row.get("id", "").strip(), + gender=row.get("gender", "").strip().upper(), bg_type=bg_type, bg_tier=classify_background(bg_type), + nationality=nationality_raw, + nationality_canonical=classify_nationality(nationality_raw), gpa_raw=gpa_raw, gpa_scale=gpa_scale, gpa_normalized=normalize_gpa(gpa_raw, gpa_scale), @@ -454,6 +518,19 @@ def compute_program_stats( if research_known else 0.0 ) + # Gender stats + gendered = [r for r in accepted if r.gender in ("M", "F")] + stats.female_rate_accepted = ( + sum(1 for r in gendered if r.gender == "F") / len(gendered) + if gendered + else 0.0 + ) + # Nationality distribution + nat_dist: dict[str, int] = {} + for r in accepted: + nat = r.nationality_canonical or "unknown" + nat_dist[nat] = nat_dist.get(nat, 0) + 1 + stats.nationality_dist_accepted = nat_dist # Rejected stats if rejected: @@ -530,6 +607,22 @@ def _effect_size(acc_vals: list[float], rej_vals: list[float]) -> float: [1.0 if r.has_research else 0.0 for r in rejected if r.has_research is not None], ) + # Gender (female = 1, male = 0) + acc_gender = [1.0 if r.gender == "F" else 0.0 for r in accepted if r.gender in ("M", "F")] + rej_gender = [1.0 if r.gender == "F" else 0.0 for r in rejected if r.gender in ("M", "F")] + features["gender_f"] = _effect_size(acc_gender, rej_gender) + + # Nationality (domestic = 1, international = 0) + acc_nat = [ + 1.0 if r.nationality_canonical == "domestic" else 0.0 + for r in accepted if r.nationality_canonical + ] + rej_nat = [ + 1.0 if r.nationality_canonical == "domestic" else 0.0 + for r in rejected if r.nationality_canonical + ] + features["domestic"] = _effect_size(acc_nat, rej_nat) + return features @@ -564,6 +657,18 @@ def summarize_records(records: list[AdmissionRecord]) -> dict[str, Any]: seasons = sorted({r.season for r in records if r.season}) sources 
= sorted({r.source for r in records if r.source}) + # Gender breakdown + gendered = [r for r in records if r.gender in ("M", "F")] + gender_dist = {"M": 0, "F": 0} + for r in gendered: + gender_dist[r.gender] += 1 + + # Nationality breakdown + nat_dist: dict[str, int] = {} + for r in records: + nat = r.nationality_canonical or "unknown" + nat_dist[nat] = nat_dist.get(nat, 0) + 1 + return { "total_records": len(records), "unique_applicants": len({r.id for r in records}), @@ -572,4 +677,6 @@ def summarize_records(records: list[AdmissionRecord]) -> dict[str, Any]: "sources": sources, "avg_gpa_normalized": _safe_avg([r.gpa_normalized for r in records]), "gre_available": sum(1 for r in records if r.gre is not None), + "gender_dist": gender_dist, + "nationality_dist": nat_dist, } diff --git a/core/calibrator.py b/core/calibrator.py index 1a3b1a5..680c516 100644 --- a/core/calibrator.py +++ b/core/calibrator.py @@ -209,28 +209,28 @@ def predict_outcome( score = 0.0 max_score = 0.0 - # GPA component (40%) - weight_gpa = 0.4 + # GPA component (35%) + weight_gpa = 0.35 max_score += weight_gpa if threshold.gpa_target > 0: gpa_ratio = record.gpa_normalized / threshold.gpa_target score += weight_gpa * min(1.0, gpa_ratio) - # Background tier (25%) - weight_bg = 0.25 + # Background tier (20%) + weight_bg = 0.20 max_score += weight_bg if threshold.max_bg_tier_accepted > 0: bg_ratio = 1.0 - (record.bg_tier - 1) / 4.0 # tier 1=1.0, tier 5=0.0 score += weight_bg * max(0.0, bg_ratio) - # Intern score (20%) - weight_intern = 0.2 + # Intern score (18%) + weight_intern = 0.18 max_score += weight_intern if record.intern_score > 0: score += weight_intern * min(1.0, record.intern_score / 8.0) - # Research/paper bonus (15%) - weight_research = 0.15 + # Research/paper bonus (12%) + weight_research = 0.12 max_score += weight_research bonus = 0.0 if record.has_paper: @@ -239,6 +239,29 @@ def predict_outcome( bonus += 0.5 score += weight_research * bonus + # Gender diversity bonus (7%) + # MFE 
programs skew heavily male; female applicants may benefit + weight_gender = 0.07 + max_score += weight_gender + if record.gender == "F": + score += weight_gender * 1.0 + elif record.gender == "M": + score += weight_gender * 0.4 # baseline, no penalty + + # Nationality / domestic advantage (8%) + # Domestic applicants (US citizens/PR) have slight advantage + weight_nat = 0.08 + max_score += weight_nat + nat = record.nationality_canonical + if nat == "domestic": + score += weight_nat * 1.0 + elif nat == "hk_tw": + score += weight_nat * 0.6 + elif nat == "china": + score += weight_nat * 0.4 # largest applicant pool, most competitive + else: + score += weight_nat * 0.5 + # Classify based on score ratio ratio = score / max_score if max_score > 0 else 0.0 diff --git a/data/admissions/sample.csv b/data/admissions/sample.csv index 2e7e071..6b9c4ba 100644 --- a/data/admissions/sample.csv +++ b/data/admissions/sample.csv @@ -1,31 +1,31 @@ -id,bg_type,gpa,gpa_scale,gre,toefl,major,intern_desc,has_paper,has_research,courses_note,program,result,season,source -1,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,baruch-mfe,accepted,2025Fall,quantnet -2,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,princeton-mfin,rejected,2025Fall,quantnet -3,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,cmu-mscf,accepted,2025Fall,quantnet -4,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,mit-mfin,rejected,2025Fall,quantnet -5,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,uchicago-msfm,accepted,2025Fall,quantnet -6,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,gatech-qcf,accepted,2025Fall,quantnet -7,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,nus-qf,accepted,2025Fall,quantnet -8,985,3.8,4,332,112,数学,2段量化实习+1段券商研究,是,是,实分析+随机过程+C++,baruch-mfe,accepted,2025Fall,chasedream -9,985,3.8,4,332,112,数学,2段量化实习+1段券商研究,是,是,实分析+随机过程+C++,princeton-mfin,rejected,2025Fall,chasedream 
-10,985,3.8,4,332,112,数学,2段量化实习+1段券商研究,是,是,实分析+随机过程+C++,cmu-mscf,accepted,2025Fall,chasedream -11,海本(Top30),3.92,4,335,,,2段顶级量化+1段投行,是,否,数学+CS双专业,baruch-mfe,accepted,2025Fall,linkedin -12,海本(Top30),3.92,4,335,,,2段顶级量化+1段投行,是,否,数学+CS双专业,princeton-mfin,accepted,2025Fall,linkedin -13,海本(Top30),3.92,4,335,,,2段顶级量化+1段投行,是,否,数学+CS双专业,cmu-mscf,accepted,2025Fall,linkedin -14,985,3.5,4,325,105,金融,1段银行实习,否,否,,columbia-mafn,rejected,2025Fall,chasedream -15,985,3.5,4,325,105,金融,1段银行实习,否,否,,gatech-qcf,accepted,2025Fall,chasedream -16,211,3.6,4,328,108,统计,2段数据分析实习,否,是,时间序列+回归分析,nyu-mfe,rejected,2025Fall,quantnet -17,211,3.6,4,328,108,统计,2段数据分析实习,否,是,时间序列+回归分析,rutgers-msmf,accepted,2025Fall,quantnet -18,211,3.6,4,328,108,统计,2段数据分析实习,否,是,时间序列+回归分析,fordham-msqf,accepted,2025Fall,quantnet -19,985,87,100,330,115,金工,3段量化+1段投行,是,是,随机微积分+实分析+ML,baruch-mfe,accepted,2025Fall,offershow -20,985,87,100,330,115,金工,3段量化+1段投行,是,是,随机微积分+实分析+ML,nyu-mfe,accepted,2025Fall,offershow -21,海本(Top50),3.7,4,329,,,1段量化实习+1段fintech,否,否,CS+数学辅修,columbia-mafn,accepted,2025Fall,offershow -22,海本(Top50),3.7,4,329,,,1段量化实习+1段fintech,否,否,CS+数学辅修,mit-mfin,rejected,2025Fall,offershow -23,双非一本,3.8,4,326,102,应用数学,1段量化实习,否,否,概率论+线代+微积分,rutgers-msmf,accepted,2025Fall,chasedream -24,双非一本,3.8,4,326,102,应用数学,1段量化实习,否,否,概率论+线代+微积分,baruch-mfe,rejected,2025Fall,chasedream -25,985,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,是,是,ML+深度学习+C++,baruch-mfe,accepted,2025Fall,quantnet -26,985,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,是,是,ML+深度学习+C++,cmu-mscf,accepted,2025Fall,quantnet -27,985,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,是,是,ML+深度学习+C++,princeton-mfin,waitlisted,2025Fall,quantnet -28,海本(Top10),3.95,4.3,337,,,数学+金融双专业,2段顶级投行+1段对冲基金,是,是,实分析+泛函+随机微积分,princeton-mfin,accepted,2025Fall,linkedin -29,海本(Top10),3.95,4.3,337,,,数学+金融双专业,2段顶级投行+1段对冲基金,是,是,实分析+泛函+随机微积分,baruch-mfe,accepted,2025Fall,linkedin -30,海本(Top10),3.95,4.3,337,,,数学+金融双专业,2段顶级投行+1段对冲基金,是,是,实分析+泛函+随机微积分,mit-mfin,accepted,2025Fall,linkedin 
+id,gender,bg_type,nationality,gpa,gpa_scale,gre,toefl,major,intern_desc,has_paper,has_research,courses_note,program,result,season,source +1,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,baruch-mfe,accepted,2025Fall,quantnet +2,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,princeton-mfin,rejected,2025Fall,quantnet +3,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,cmu-mscf,accepted,2025Fall,quantnet +4,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,mit-mfin,rejected,2025Fall,quantnet +5,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,uchicago-msfm,accepted,2025Fall,quantnet +6,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,gatech-qcf,accepted,2025Fall,quantnet +7,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,nus-qf,accepted,2025Fall,quantnet +8,M,985,中国大陆,3.8,4,332,112,数学,2段量化实习+1段券商研究,是,是,实分析+随机过程+C++,baruch-mfe,accepted,2025Fall,chasedream +9,M,985,中国大陆,3.8,4,332,112,数学,2段量化实习+1段券商研究,是,是,实分析+随机过程+C++,princeton-mfin,rejected,2025Fall,chasedream +10,M,985,中国大陆,3.8,4,332,112,数学,2段量化实习+1段券商研究,是,是,实分析+随机过程+C++,cmu-mscf,accepted,2025Fall,chasedream +11,F,海本(Top30),美籍,3.92,4,335,,,2段顶级量化+1段投行,是,否,数学+CS双专业,baruch-mfe,accepted,2025Fall,linkedin +12,F,海本(Top30),美籍,3.92,4,335,,,2段顶级量化+1段投行,是,否,数学+CS双专业,princeton-mfin,accepted,2025Fall,linkedin +13,F,海本(Top30),美籍,3.92,4,335,,,2段顶级量化+1段投行,是,否,数学+CS双专业,cmu-mscf,accepted,2025Fall,linkedin +14,M,985,中国大陆,3.5,4,325,105,金融,1段银行实习,否,否,,columbia-mafn,rejected,2025Fall,chasedream +15,M,985,中国大陆,3.5,4,325,105,金融,1段银行实习,否,否,,gatech-qcf,accepted,2025Fall,chasedream +16,F,211,中国大陆,3.6,4,328,108,统计,2段数据分析实习,否,是,时间序列+回归分析,nyu-mfe,rejected,2025Fall,quantnet +17,F,211,中国大陆,3.6,4,328,108,统计,2段数据分析实习,否,是,时间序列+回归分析,rutgers-msmf,accepted,2025Fall,quantnet +18,F,211,中国大陆,3.6,4,328,108,统计,2段数据分析实习,否,是,时间序列+回归分析,fordham-msqf,accepted,2025Fall,quantnet 
+19,M,985,中国大陆,87,100,330,115,金工,3段量化+1段投行,是,是,随机微积分+实分析+ML,baruch-mfe,accepted,2025Fall,offershow +20,M,985,中国大陆,87,100,330,115,金工,3段量化+1段投行,是,是,随机微积分+实分析+ML,nyu-mfe,accepted,2025Fall,offershow +21,M,海本(Top50),港澳台,3.7,4,329,,,1段量化实习+1段fintech,否,否,CS+数学辅修,columbia-mafn,accepted,2025Fall,offershow +22,M,海本(Top50),港澳台,3.7,4,329,,,1段量化实习+1段fintech,否,否,CS+数学辅修,mit-mfin,rejected,2025Fall,offershow +23,M,双非一本,中国大陆,3.8,4,326,102,应用数学,1段量化实习,否,否,概率论+线代+微积分,rutgers-msmf,accepted,2025Fall,chasedream +24,M,双非一本,中国大陆,3.8,4,326,102,应用数学,1段量化实习,否,否,概率论+线代+微积分,baruch-mfe,rejected,2025Fall,chasedream +25,M,985,中国大陆,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,是,是,ML+深度学习+C++,baruch-mfe,accepted,2025Fall,quantnet +26,M,985,中国大陆,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,是,是,ML+深度学习+C++,cmu-mscf,accepted,2025Fall,quantnet +27,M,985,中国大陆,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,是,是,ML+深度学习+C++,princeton-mfin,waitlisted,2025Fall,quantnet +28,F,海本(Top10),美籍,3.95,4.3,337,,数学+金融双专业,2段顶级投行+1段对冲基金,是,是,实分析+泛函+随机微积分,princeton-mfin,accepted,2025Fall,linkedin +29,F,海本(Top10),美籍,3.95,4.3,337,,数学+金融双专业,2段顶级投行+1段对冲基金,是,是,实分析+泛函+随机微积分,baruch-mfe,accepted,2025Fall,linkedin +30,F,海本(Top10),美籍,3.95,4.3,337,,数学+金融双专业,2段顶级投行+1段对冲基金,是,是,实分析+泛函+随机微积分,mit-mfin,accepted,2025Fall,linkedin diff --git a/data/admissions/template.csv b/data/admissions/template.csv index 76904e3..53cf106 100644 --- a/data/admissions/template.csv +++ b/data/admissions/template.csv @@ -1 +1 @@ -id,bg_type,gpa,gpa_scale,gre,toefl,major,intern_desc,has_paper,has_research,courses_note,program,result,season,source +id,gender,bg_type,nationality,gpa,gpa_scale,gre,toefl,major,intern_desc,has_paper,has_research,courses_note,program,result,season,source diff --git a/tests/test_admission_data.py b/tests/test_admission_data.py index 35660fa..08baad7 100644 --- a/tests/test_admission_data.py +++ b/tests/test_admission_data.py @@ -11,6 +11,7 @@ from core.admission_data import ( AdmissionRecord, classify_background, + classify_nationality, compute_all_program_stats,
# ===================================================================
# Nationality classification
# ===================================================================


class TestClassifyNationality:
    """Tests for classify_nationality()."""

    @pytest.mark.parametrize(
        "nationality,expected",
        [
            ("美籍", "domestic"),
            ("US", "domestic"),
            # Spaces are removed before lookup, so this hits "greencard".
            ("green card", "domestic"),
            ("绿卡", "domestic"),
            ("中国大陆", "china"),
            ("中国", "china"),
            ("China", "china"),
            ("港澳台", "hk_tw"),
            ("香港", "hk_tw"),
            ("台湾", "hk_tw"),
            ("HK", "hk_tw"),
            # Values with no alias fall through to the catch-all bucket.
            ("韩国", "other_intl"),
            ("India", "other_intl"),
        ],
    )
    def test_known_nationalities(self, nationality, expected):
        """Each known alias maps to its canonical category."""
        assert classify_nationality(nationality) == expected

    @pytest.mark.parametrize(
        "blank",
        ["", "   ", "不明", "n/a", "N/A", "unknown", "Unknown"],
    )
    def test_empty_defaults_to_china(self, blank):
        """Empty, whitespace-only and 'unknown' placeholders default to china."""
        assert classify_nationality(blank) == "china"

    def test_case_insensitive(self):
        """Letter case of the input does not affect the result."""
        assert classify_nationality("CHINA") == "china"
        assert classify_nationality("Domestic") == "domestic"

    def test_surrounding_whitespace_ignored(self):
        """Leading/trailing whitespace is stripped before matching."""
        assert classify_nationality("  China  ") == "china"
        assert classify_nationality(" 美籍 ") == "domestic"
self._write_csv( [ { - "id": "1", "bg_type": "985", "gpa": "3.8", "gpa_scale": "4", + "id": "1", "gender": "M", "bg_type": "985", + "nationality": "中国大陆", + "gpa": "3.8", "gpa_scale": "4", "gre": "332", "toefl": "112", "major": "数学", "intern_desc": "2段量化实习", "has_paper": "是", "has_research": "是", "courses_note": "", @@ -176,12 +218,16 @@ def test_load_basic(self, tmp_path): assert records[0].gpa_normalized == 3.8 assert records[0].bg_tier == 2 assert records[0].gre == 332 + assert records[0].gender == "M" + assert records[0].nationality_canonical == "china" def test_skips_pending(self, tmp_path): path = self._write_csv( [ { - "id": "1", "bg_type": "985", "gpa": "3.8", "gpa_scale": "4", + "id": "1", "gender": "", "bg_type": "985", + "nationality": "", + "gpa": "3.8", "gpa_scale": "4", "gre": "", "toefl": "", "major": "", "intern_desc": "", "has_paper": "", "has_research": "", "courses_note": "", "program": "baruch-mfe", "result": "pending", @@ -197,7 +243,9 @@ def test_normalizes_100_scale(self, tmp_path): path = self._write_csv( [ { - "id": "1", "bg_type": "211", "gpa": "91.8", "gpa_scale": "100", + "id": "1", "gender": "F", "bg_type": "211", + "nationality": "美籍", + "gpa": "91.8", "gpa_scale": "100", "gre": "331", "toefl": "110+", "major": "金工", "intern_desc": "", "has_paper": "不明", "has_research": "不明", "courses_note": "", "program": "cmu-mscf", "result": "accepted", @@ -211,6 +259,8 @@ def test_normalizes_100_scale(self, tmp_path): assert 3.7 <= records[0].gpa_normalized <= 3.9 assert records[0].toefl == 110 # stripped '+' assert records[0].has_paper is None # '不明' -> None + assert records[0].gender == "F" + assert records[0].nationality_canonical == "domestic" def test_file_not_found(self): with pytest.raises(FileNotFoundError): @@ -228,22 +278,26 @@ class TestComputeStats: def _make_records(self) -> list[AdmissionRecord]: return [ AdmissionRecord( - id="1", program="baruch-mfe", result="accepted", + id="1", gender="M", nationality_canonical="china", + 
def test_gender_stats(self):
    """Female share among accepted baruch-mfe applicants is computed."""
    records = self._make_records()
    stats = compute_program_stats(records, "baruch-mfe")
    # 2 accepted: 1 M + 1 F -> female_rate = 0.5
    assert stats.female_rate_accepted == pytest.approx(0.5, rel=0.01)

def test_nationality_dist(self):
    """Nationality histogram covers exactly the accepted applicants."""
    records = self._make_records()
    stats = compute_program_stats(records, "baruch-mfe")
    # 2 accepted: 1 china + 1 domestic. Comparing the whole dict also
    # catches a spurious extra bucket (e.g. "unknown").
    assert stats.nationality_dist_accepted == {"china": 1, "domestic": 1}

def test_summarize(self):
    """Dataset summary reports totals plus gender/nationality breakdowns."""
    records = self._make_records()
    summary = summarize_records(records)
    assert summary["total_records"] == 4
    assert "baruch-mfe" in summary["programs"]
    # Fixture: 3 male + 1 female; 3 china + 1 domestic.
    assert summary["gender_dist"] == {"M": 3, "F": 1}
    assert summary["nationality_dist"] == {"china": 3, "domestic": 1}
def test_gender_nationality_influence(self):
    """Female domestic applicant never gets a worse outcome than male China.

    Both records share identical academic stats and differ only in gender
    and nationality. The predicted label is the only observable, so the
    check is an ordering on outcomes rather than on raw scores.
    """
    threshold = ProgramThreshold(
        program_id="test",
        gpa_target=3.7,
        max_bg_tier_accepted=3,
        observed_acceptance_rate=0.50,
    )
    # Same stats, different gender + nationality
    base = dict(
        gpa_normalized=3.7, bg_tier=2, intern_score=5.0,
        has_paper=False, has_research=False,
    )
    female_domestic = AdmissionRecord(
        gender="F", nationality_canonical="domestic", **base,
    )
    male_china = AdmissionRecord(
        gender="M", nationality_canonical="china", **base,
    )
    r1 = predict_outcome(female_domestic, threshold)
    r2 = predict_outcome(male_china, threshold)
    # If the lower-weighted profile is accepted, the higher-weighted one
    # must be accepted too ...
    if r2 == "accepted":
        assert r1 == "accepted"
    # ... and conversely, if the higher-weighted profile is rejected, the
    # lower-weighted one cannot do better. Without this second check the
    # test passes vacuously whenever male_china is not accepted.
    if r1 == "rejected":
        assert r2 == "rejected"