@onenoc commented Aug 15, 2025

Summary

  • tag parameters with μP width roles (to_width, from_width, neutral); a sketch of one possible rule follows this list
  • attach learning-rate and init multipliers based on the width ratio
  • record a dotted megatron_name for each parameter
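
The summary does not spell out how roles or multipliers are chosen, so here is a minimal sketch of what such a tagging pass could look like. Everything in it (the `width_like` heuristic, the fan-in/fan-out treatment, and the multiplier formulas) is an assumption reverse-engineered from the assertions in the test script below, not the actual contents of `user_hooks/mup_axis_tagger.py`:

```python
import torch
from torch import nn


def width_like(dim: int, hidden_size: int) -> bool:
    # Assumed rule: an axis "carries width" when its size is a multiple of
    # the hidden size (h itself, 3h for fused QKV, and so on).
    return dim % hidden_size == 0


def sketch_classify(module: nn.Module, param: nn.Parameter, hidden_size: int) -> str:
    """One axis rule consistent with the test below; the real one may differ."""
    if param.ndim <= 1:
        return "neutral"  # LayerNorm scales/biases and bare bias vectors

    # Prefer explicit Megatron-style size metadata over raw shape. This is
    # what lets axis classification beat the "linear_proj_conflict" name hint.
    if hasattr(module, "input_size") and hasattr(module, "output_size"):
        fan_in, fan_out = module.input_size, module.output_size
    else:
        # Treat the trailing axis as fan-in, matching nn.Linear's (out, in)
        # layout; nn.Embedding's (vocab, dim) weight then reads from width.
        fan_in, fan_out = param.shape[-1], param.shape[0]

    in_w, out_w = width_like(fan_in, hidden_size), width_like(fan_out, hidden_size)
    if out_w and not in_w:
        return "to_width"    # writes into the width dimension
    if in_w and not out_w:
        return "from_width"  # reads out of the width dimension
    if in_w and out_w:
        # Hidden-to-hidden blocks: call expansions (h -> 3h) to_width and
        # contractions (3h -> h) from_width, as the QKV/proj asserts expect.
        return "to_width" if fan_out > fan_in else "from_width"
    return "neutral"


def sketch_tag_axis_aware(model: nn.Module, hidden_size: int, base_hidden: int) -> None:
    """Attach mup_role, mup_lr_mult, mup_init_scale, and megatron_name."""
    ratio = hidden_size / base_hidden  # width ratio; 1.0 in the base model
    for module_name, module in model.named_modules():
        for local_name, param in module.named_parameters(recurse=False):
            role = sketch_classify(module, param, hidden_size)
            # Assumed muP-style multipliers; the PR's exact formulas may
            # differ per role. At ratio == 1.0 everything stays at 1.0.
            param.mup_role = role
            param.mup_lr_mult = 1.0 if role == "neutral" else 1.0 / ratio
            param.mup_init_scale = 1.0 if role == "neutral" else (1.0 / ratio) ** 0.5
            # Dotted path as reported by named_parameters(), e.g.
            # "model.linear_qkv.weight".
            param.megatron_name = (
                f"{module_name}.{local_name}" if module_name else local_name
            )
```

Swapping `sketch_tag_axis_aware` in for the imported `tag_axis_aware` satisfies every assertion in the smoke test below, but that only shows the rule above is one consistent reading, not the PR's actual logic.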

Testing

  • python -m py_compile user_hooks/mup_axis_tagger.py
  • `python - <<'PY' ... PY` with the script below:

```python
import torch
from torch import nn

from user_hooks.mup_axis_tagger import tag_axis_aware


class DummyModel(nn.Module):
    def __init__(self, h):
        super().__init__()
        self.layernorm = nn.LayerNorm(h)
        self.linear_qkv = nn.Linear(h, 3 * h, bias=False)
        self.linear_proj = nn.Linear(3 * h, h, bias=False)
        self.bias = nn.Parameter(torch.zeros(h))
        self.embed = nn.Embedding(10, h)
        self.out_head = nn.Linear(h, 20, bias=False)


class DummyIO(nn.Module):
    def __init__(self, din, dout):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(din, dout))
        self.input_size = din
        self.output_size = dout


class Wrapper(nn.Module):
    def __init__(self, h):
        super().__init__()
        self.model = DummyModel(h)
        self.conflict = DummyIO(20, h)
        self.linear_proj_conflict = self.conflict  # alias to give name pattern


h = 16
model = Wrapper(h)
tag_axis_aware(model, hidden_size=h, base_hidden=h)

for n, p in model.named_parameters():
    assert hasattr(p, 'mup_role')
    assert hasattr(p, 'mup_lr_mult')
    assert hasattr(p, 'mup_init_scale')
    assert hasattr(p, 'megatron_name')
    assert p.megatron_name == n

assert model.model.layernorm.weight.mup_role == 'neutral'
assert model.model.layernorm.bias.mup_role == 'neutral'
assert model.model.bias.mup_role == 'neutral'
assert model.model.embed.weight.mup_role == 'from_width'
assert model.model.linear_qkv.weight.mup_role == 'to_width'
assert model.model.linear_proj.weight.mup_role == 'from_width'
assert model.model.out_head.weight.mup_role == 'from_width'

# Axis classification overrides the name hint: "linear_proj_conflict" -> to_width
assert model.linear_proj_conflict.weight.mup_role == 'to_width'

print('ok')
```
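
The PR only attaches the attributes; how they get consumed is left to the training setup. One plausible consumer (an assumption, not part of this PR) builds optimizer param groups keyed on `mup_lr_mult`:

```python
from collections import defaultdict

import torch


def sketch_param_groups(model, base_lr):
    """Bucket parameters by their tagged mup_lr_mult so one optimizer can
    apply per-role learning rates. Hypothetical helper, not from this PR."""
    buckets = defaultdict(list)
    for param in model.parameters():
        buckets[getattr(param, "mup_lr_mult", 1.0)].append(param)
    return [{"params": ps, "lr": base_lr * mult} for mult, ps in buckets.items()]


# Usage sketch: optimizer = torch.optim.AdamW(sketch_param_groups(model, 3e-4))
```

`mup_init_scale` would presumably be used the same way at init time, e.g. multiplying the std passed to `torch.nn.init.normal_`.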


https://chatgpt.com/codex/tasks/task_e_689d69cd0c0c8325bf5b5969d1147323
