diff --git a/Makefile b/Makefile index 88a7158..249ea15 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ OBJDIR = obj SRC = main.c util.c parse.c cfg.c mem.c ssa.c alias.c load.c copy.c \ fold.c live.c spill.c rega.c gas.c -AMD64SRC = amd64/targ.c amd64/sysv.c amd64/isel.c amd64/emit.c +AMD64SRC = amd64/targ.c amd64/sysv.c amd64/win64.c amd64/isel.c amd64/emit.c amd64/win64_emit.c ARM64SRC = arm64/targ.c arm64/abi.c arm64/isel.c arm64/emit.c SRCALL = $(SRC) $(AMD64SRC) $(ARM64SRC) diff --git a/amd64/all.h b/amd64/all.h index 3a2db0e..db26bf8 100644 --- a/amd64/all.h +++ b/amd64/all.h @@ -63,6 +63,14 @@ bits amd64_sysv_retregs(Ref, int[2]); bits amd64_sysv_argregs(Ref, int[2]); void amd64_sysv_abi(Fn *); +/* win64.c (abi) */ +extern int amd64_win64_rsave[]; +extern int amd64_win64_rclob[]; +bits amd64_win64_retregs(Ref, int[2]); +bits amd64_win64_argregs(Ref, int[2]); +void amd64_win64_abi(Fn *); +void amd64_win64_emitfn(Fn *, FILE *); + /* isel.c */ void amd64_isel(Fn *); diff --git a/amd64/targ.c b/amd64/targ.c index e227574..a451913 100644 --- a/amd64/targ.c +++ b/amd64/targ.c @@ -28,3 +28,20 @@ Target T_amd64_sysv = { .isel = amd64_isel, .emitfn = amd64_emitfn, }; + +Target T_amd64_win64 = { + .gpr0 = RAX, + .ngpr = NGPR, + .fpr0 = XMM0, + .nfpr = NFPR, + .rglob = BIT(RBP) | BIT(RSP), + .nrglob = 2, + .rsave = amd64_win64_rsave, + .nrsave = {NGPS, NFPS}, + .retregs = amd64_win64_retregs, + .argregs = amd64_win64_argregs, + .memargs = amd64_memargs, + .abi = amd64_win64_abi, + .isel = amd64_isel, + .emitfn = amd64_win64_emitfn, +}; diff --git a/amd64/win64.c b/amd64/win64.c new file mode 100644 index 0000000..aa19873 --- /dev/null +++ b/amd64/win64.c @@ -0,0 +1,617 @@ +#include "all.h" + +/* Windows x64 calling convention summary: + * - Four register argument slots: RCX, RDX, R8, R9 (GPR) or XMM0..3 (FP) chosen per argument type. + * - Caller must reserve 32 bytes of shadow space for every call. + * - Return: scalars in RAX/XMM0. 
Aggregates > 8 bytes use a hidden sret pointer in RCX; callee returns that pointer in RAX. + * - We currently treat all XMM registers as caller-saved (safe, but not optimal) to avoid changing the shared register model. + * - Varargs use the shadow space to access register arguments. + */ + +typedef struct WArgClass WArgClass; + +struct WArgClass { + int inmem; + int align; + uint size; + int cls; /* Kw/Kl/Ks/Kd */ +}; + +static int win64_gpr_arg[] = {RCX, RDX, R8, R9}; +static int win64_fpr_arg[] = {XMM0, XMM1, XMM2, XMM3}; + +enum { Win64Shadow = 32 }; + +int amd64_win64_rsave[] = { + RCX, RDX, R8, R9, R10, R11, RAX, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, + -1 +}; + +/* Only GPR callee-saves are modeled to match the shared emitter/reg model. */ +int amd64_win64_rclob[] = { + RBX, RBP, RSI, RDI, R12, R13, R14, R15, + XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, + -1 +}; + +static uint +win64_align8(uint n) +{ + return (n + 7u) & -8u; +} + +static uint +win64_align16(uint n) +{ + return (n + 15u) & ~15u; +} + +static int +win64_stackslot(uint off) +{ + return -(int)((off + 16u) / 4u); +} + +static int +win64_storeop(int cls) +{ + static int st[] = { Ostorew, Ostorel, Ostores, Ostored }; + + if (cls < Kw || cls > Kd) + die("invalid win64 store class"); + return st[cls]; +} + +static int +win64_popcnt4(int m) +{ + return (m & 1) + ((m >> 1) & 1) + ((m >> 2) & 1) + ((m >> 3) & 1); +} + +static void +win64_typclass(WArgClass *a, Typ *t) +{ + uint al; + + a->inmem = 0; + a->cls = Kl; + + al = 1u << t->align; + if (al > 8) + al = 8; + a->size = win64_align8(t->size); + a->align = t->align; + + if (t->dark || a->size == 0 || a->size > 8) { + a->inmem = 1; + a->cls = Kl; + return; + } + if (a->size <= 4) + a->cls = Kw; +} + +static int +argclass(Ins *i0, Ins *i1, WArgClass *ac, WArgClass *aret, Ref *env) +{ + int slot, nslot; + Ins *i; + WArgClass *a; + + slot = (aret && aret->inmem) ? 
1 : 0; + nslot = 0; + for (i=i0, a=ac; iop) { + case Oarg: + case Opar: + a->inmem = slot >= 4; + a->align = 3; + a->size = 8; + a->cls = i->cls; + if (a->inmem) + nslot++; + break; + case Oargc: + case Oparc: + win64_typclass(a, &typ[i->arg[0].val]); + if (slot >= 4) + nslot += a->inmem ? 1 : (a->size + 7) / 8; + break; + case Oarge: + slot--; + if (env) + *env = i->arg[0]; + break; + case Opare: + slot--; + if (env) + *env = i->to; + break; + } + } + + return nslot; +} + +bits +amd64_win64_retregs(Ref r, int p[2]) +{ + bits b; + int ni, nf; + + assert(rtype(r) == RCall); + b = 0; + ni = r.val & 3; + nf = (r.val >> 2) & 3; + if (ni >= 1) + b |= BIT(RAX); + if (nf >= 1) + b |= BIT(XMM0); + if (p) { + p[0] = ni; + p[1] = nf; + } + return b; +} + +bits +amd64_win64_argregs(Ref r, int p[2]) +{ + bits b; + int j, gpm, fpm, ra; + + assert(rtype(r) == RCall); + b = 0; + gpm = (r.val >> 4) & 15; + fpm = (r.val >> 8) & 15; + ra = (r.val >> 12) & 1; + for (j=0; j<4; j++) + if (gpm & (1 << j)) + b |= BIT(win64_gpr_arg[j]); + for (j=0; j<4; j++) + if (fpm & (1 << j)) + b |= BIT(win64_fpr_arg[j]); + if (ra) + b |= BIT(RAX); + if (p) { + p[0] = win64_popcnt4(gpm) + ra; + p[1] = win64_popcnt4(fpm); + } + return b; +} + +static Ref +win64_rarg(int cls, int slot) +{ + if (KBASE(cls) == 0) + return TMP(win64_gpr_arg[slot]); + return TMP(win64_fpr_arg[slot]); +} + +static void +selret(Blk *b, Fn *fn) +{ + int j, k, ca; + Ref r0; + WArgClass aret; + + j = b->jmp.type; + if (!isret(j) || j == Jret0) + return; + + r0 = b->jmp.arg; + b->jmp.type = Jret0; + ca = 0; + + if (j == Jretc) { + win64_typclass(&aret, &typ[fn->retty]); + if (aret.inmem) { + assert(rtype(fn->retr) == RTmp); + emit(Ocopy, Kl, TMP(RAX), fn->retr, R); + blit(fn->retr, 0, r0, aret.size, fn); + ca = 1; + } else { + emit(Oload, aret.cls, TMP(RAX), r0, R); + ca = 1; + } + } else { + k = j - Jretw; + if (KBASE(k) == 0) { + emit(Ocopy, k, TMP(RAX), r0, R); + ca = 1; + } else { + emit(Ocopy, k, TMP(XMM0), r0, R); + ca = 1 
<< 2; + } + } + + b->jmp.arg = CALL(ca); +} + +static void +selcall(Fn *fn, Ins *i0, Ins *i1) +{ + Ins *i; + WArgClass *ac, *a, aret; + int ca, slot, gpm, fpm, envc, varc; + uint off, stk; + Ref r, r1, rstk, retbuf, env; + int al; + int hasstk, sslot; + + varc = i1->op == Ovacall; + env = R; + ac = alloc((i1-i0) * sizeof ac[0]); + if (!req(i1->arg[1], R)) { + assert(rtype(i1->arg[1]) == RType); + win64_typclass(&aret, &typ[i1->arg[1].val]); + } else { + memset(&aret, 0, sizeof aret); + aret.inmem = 0; + aret.size = 8; + aret.cls = i1->cls; + } + + argclass(i0, i1, ac, aret.inmem ? &aret : 0, &env); + + hasstk = 0; + off = Win64Shadow; + sslot = (!req(i1->arg[1], R) && aret.inmem) ? 1 : 0; + for (i=i0, a=ac; iop == Oarge) + continue; + if (sslot >= 4) { + off += 8; + hasstk = 1; + } + sslot++; + } + stk = win64_align16(off); + + ca = 0; + slot = 0; + gpm = 0; + fpm = 0; + retbuf = R; + + if (!req(i1->arg[1], R)) { + retbuf = newtmp("abi", Kl, fn); + if (aret.inmem) { + ca |= 1; /* gp return */ + gpm |= 1; + slot++; /* hidden sret consumes RCX */ + } else if (KBASE(aret.cls) == 0) { + ca |= 1; + } else { + ca |= 1 << 2; + } + } else { + if (KBASE(i1->cls) == 0) + ca |= 1; + else + ca |= 1 << 2; + } + + for (i=i0, a=ac; iop) { + case Oarg: + if (slot < 4 && !a->inmem) { + if (KBASE(i->cls) == 0) + gpm |= 1 << slot; + else + fpm |= 1 << slot; + } + slot++; + break; + case Oargc: + if (slot < 4) { + if (a->inmem) { + gpm |= 1 << slot; + } else { + if (a->size > 8) + err("win64 abi cannot pass aggregates >8 bytes in registers"); + if (KBASE(a->cls) == 0) + gpm |= 1 << slot; + else + fpm |= 1 << slot; + } + } + slot++; + break; + case Oarge: + break; + default: + die("unreachable"); + } + } + + envc = !req(R, env); + ca |= (gpm << 4) | (fpm << 8); + if (envc) + ca |= 1 << 12; + + r = R; + if (hasstk || varc) + r = newtmp("abi", Kl, fn); + + if (stk) { + rstk = getcon(-(int64_t)stk, fn); + emit(Osalloc, Kl, R, rstk, R); + } + + if (!req(i1->arg[1], R)) { + if (aret.inmem) 
{ + emit(Ocopy, Kl, i1->to, TMP(RAX), R); + } else { + Ref tmp; + + tmp = newtmp("abi", aret.cls, fn); + if (KBASE(aret.cls) == 0) + emit(win64_storeop(aret.cls), 0, R, tmp, retbuf); + else + emit(win64_storeop(aret.cls), 0, R, tmp, retbuf); + if (KBASE(aret.cls) == 0) + emit(Ocopy, aret.cls, tmp, TMP(RAX), R); + else + emit(Ocopy, aret.cls, tmp, TMP(XMM0), R); + emit(Ocopy, Kl, i1->to, retbuf, R); + } + } else { + if (KBASE(i1->cls) == 0) + emit(Ocopy, i1->cls, i1->to, TMP(RAX), R); + else + emit(Ocopy, i1->cls, i1->to, TMP(XMM0), R); + } + + emit(Ocall, i1->cls, R, i1->arg[0], CALL(ca)); + + if (envc) + emit(Ocopy, Kl, TMP(RAX), env, R); + + slot = (!req(i1->arg[1], R) && aret.inmem) ? 1 : 0; + off = Win64Shadow; + + for (i=i0, a=ac; iop) { + case Oarg: + if (slot < 4 && !a->inmem) { + emit(Ocopy, i->cls, win64_rarg(i->cls, slot), i->arg[0], R); + if (varc) { + r1 = newtmp("abi", Kl, fn); + emit(win64_storeop(i->cls), 0, R, i->arg[0], r1); + emit(Oadd, Kl, r1, r, getcon(slot * 8, fn)); + } + } else { + r1 = newtmp("abi", Kl, fn); + emit(win64_storeop(i->cls), 0, R, i->arg[0], r1); + emit(Oadd, Kl, r1, r, getcon(off, fn)); + off += 8; + } + slot++; + break; + case Oargc: + if (a->inmem) { + if (slot < 4) { + emit(Ocopy, Kl, win64_rarg(Kl, slot), i->arg[1], R); + if (varc) { + r1 = newtmp("abi", Kl, fn); + emit(Ostorel, 0, R, i->arg[1], r1); + emit(Oadd, Kl, r1, r, getcon(slot * 8, fn)); + } + } else { + r1 = newtmp("abi", Kl, fn); + emit(Ostorel, 0, R, i->arg[1], r1); + emit(Oadd, Kl, r1, r, getcon(off, fn)); + off += 8; + } + } else if (slot < 4) { + if (a->size > 8) + err("win64 abi cannot pass aggregates >8 bytes in registers"); + emit(Oload, a->cls, win64_rarg(a->cls, slot), i->arg[1], R); + if (varc) { + Ref tmp; + + tmp = newtmp("abi", a->cls, fn); + r1 = newtmp("abi", Kl, fn); + emit(win64_storeop(a->cls), 0, R, tmp, r1); + emit(Oadd, Kl, r1, r, getcon(slot * 8, fn)); + emit(Oload, a->cls, tmp, i->arg[1], R); + } + } else { + blit(r, off, i->arg[1], 
a->size, fn); + off += 8; + } + slot++; + break; + case Oarge: + break; + default: + die("unreachable"); + } + } + + if (!req(i1->arg[1], R)) { + al = aret.align >= 2 ? aret.align - 2 : 0; + if (aret.inmem) + emit(Ocopy, Kl, win64_rarg(Kl, 0), retbuf, R); + emit(Oalloc + al, Kl, retbuf, getcon(aret.size, fn), R); + } + + if (stk) + emit(Osalloc, Kl, r, getcon(stk, fn), R); +} + +static uint +selpar(Fn *fn, Ins *i0, Ins *i1) +{ + WArgClass *ac, aret, *a; + Ins *i; + int slot; + uint off, vaoff; + Ref r, env; + int al; + + env = R; + ac = alloc((i1-i0) * sizeof ac[0]); + curi = &insb[NIns]; + if (fn->retty >= 0) + win64_typclass(&aret, &typ[fn->retty]); + else { + memset(&aret, 0, sizeof aret); + aret.inmem = 0; + } + + argclass(i0, i1, ac, aret.inmem ? &aret : 0, &env); + + slot = 0; + off = Win64Shadow; + + if (fn->retty >= 0 && aret.inmem) { + fn->retr = TMP(RCX); + slot++; + } + + for (i=i0, a=ac; iop) { + case Opar: + if (slot < 4 && !a->inmem) { + r = win64_rarg(i->cls, slot); + emit(Ocopy, i->cls, i->to, r, R); + } else { + emit(Oload, i->cls, i->to, SLOT(win64_stackslot(off)), R); + off += 8; + } + slot++; + break; + case Oparc: + if (a->inmem) { + if (slot < 4) { + r = win64_rarg(Kl, slot); + emit(Ocopy, Kl, i->to, r, R); + } else { + emit(Oload, Kl, i->to, SLOT(win64_stackslot(off)), R); + off += 8; + } + } else if (slot >= 4) { + fn->tmp[i->to.val].slot = win64_stackslot(off); + off += 8; + } else { + if (a->size > 8) + err("win64 abi cannot pass aggregates >8 bytes in registers"); + r = win64_rarg(a->cls, slot); + emit(win64_storeop(a->cls), 0, R, r, i->to); + al = a->align >= 2 ? 
a->align - 2 : 0; + emit(Oalloc + al, Kl, i->to, getcon(a->size, fn), R); + } + slot++; + break; + case Opare: + break; + default: + die("unreachable"); + } + } + + if (!req(R, env)) + emit(Ocopy, Kl, env, TMP(RAX), R); + + if (!fn->vararg) + return 0; + + if (slot < 4) + vaoff = slot * 8u; + else + vaoff = off; + + return vaoff; +} + +static void +selvastart(Fn *fn, uint vaoff, Ref ap) +{ + Ref r0; + + r0 = newtmp("abi", Kl, fn); + emit(Ostorel, Kw, R, r0, ap); + emit(Oadd, Kl, r0, TMP(RBP), getcon(16 + (int64_t)vaoff, fn)); +} + +static void +selvaarg(Fn *fn, Ins *i) +{ + Ref ap, cur, next, c8; + + ap = i->arg[0]; + c8 = getcon(8, fn); + cur = newtmp("abi", Kl, fn); + next = newtmp("abi", Kl, fn); + + emit(Ostorel, Kw, R, next, ap); + emit(Oadd, Kl, next, cur, c8); + emit(Oload, i->cls, i->to, cur, R); + emit(Oload, Kl, cur, ap, R); +} + +void +amd64_win64_abi(Fn *fn) +{ + Blk *b; + Ins *i, *i0, *ip; + uint vaoff; + int n; + + for (b=fn->start; b; b=b->link) + b->visit = 0; + + for (b=fn->start, i=b->ins; i-b->insnins; i++) + if (!ispar(i->op)) + break; + vaoff = selpar(fn, b->ins, i); + n = b->nins - (i - b->ins) + (&insb[NIns] - curi); + i0 = alloc(n * sizeof(Ins)); + ip = icpy(ip = i0, curi, &insb[NIns] - curi); + ip = icpy(ip, i, &b->ins[b->nins] - i); + b->nins = n; + b->ins = i0; + + b = fn->start; + do { + if (!(b = b->link)) + b = fn->start; + if (b->visit) + continue; + curi = &insb[NIns]; + selret(b, fn); + for (i=&b->ins[b->nins]; i!=b->ins; ) + switch ((--i)->op) { + default: + emiti(*i); + break; + case Ocall: + case Ovacall: + for (i0=i; i0>b->ins; i0--) + if (!isarg((i0-1)->op)) + break; + selcall(fn, i0, i); + i = i0; + break; + case Oarg: + case Oargc: + die("unreachable"); + case Ovastart: + selvastart(fn, vaoff, i->arg[0]); + break; + case Ovaarg: + selvaarg(fn, i); + break; + } + b->nins = &insb[NIns] - curi; + idup(&b->ins, curi, b->nins); + } while (b != fn->start); + + if (debug['A']) { + fprintf(stderr, "\n> After ABI lowering 
(win64):\n"); + printfn(fn, stderr); + } +} diff --git a/amd64/win64_emit.c b/amd64/win64_emit.c new file mode 100644 index 0000000..1ea38fe --- /dev/null +++ b/amd64/win64_emit.c @@ -0,0 +1,569 @@ +#include "all.h" + +/* This emitter mirrors amd64/emit.c but uses the Win64 callee-save set. */ + +#define CMP(X) \ + X(Ciule, "be") \ + X(Ciult, "b") \ + X(Cisle, "le") \ + X(Cislt, "l") \ + X(Cisgt, "g") \ + X(Cisge, "ge") \ + X(Ciugt, "a") \ + X(Ciuge, "ae") \ + X(Cieq, "z") \ + X(Cine, "nz") \ + X(NCmpI+Cfle, "be") \ + X(NCmpI+Cflt, "b") \ + X(NCmpI+Cfgt, "a") \ + X(NCmpI+Cfge, "ae") \ + X(NCmpI+Cfeq, "z") \ + X(NCmpI+Cfne, "nz") \ + X(NCmpI+Cfo, "np") \ + X(NCmpI+Cfuo, "p") + +enum { + SLong = 0, + SWord = 1, + SShort = 2, + SByte = 3, + + Ki = -1, /* matches Kw and Kl */ + Ka = -2, /* matches all classes */ +}; + +static struct { + short op; + short cls; + char *asm; +} omap_win64[] = { + { Oadd, Ka, "+add%k %1, %=" }, + { Osub, Ka, "-sub%k %1, %=" }, + { Oand, Ki, "+and%k %1, %=" }, + { Oor, Ki, "+or%k %1, %=" }, + { Oxor, Ki, "+xor%k %1, %=" }, + { Osar, Ki, "-sar%k %B1, %=" }, + { Oshr, Ki, "-shr%k %B1, %=" }, + { Oshl, Ki, "-shl%k %B1, %=" }, + { Omul, Ki, "+imul%k %1, %=" }, + { Omul, Ks, "+mulss %1, %=" }, + { Omul, Kd, "+mulsd %1, %=" }, + { Odiv, Ka, "-div%k %1, %=" }, + { Ostorel, Ka, "movq %L0, %M1" }, + { Ostorew, Ka, "movl %W0, %M1" }, + { Ostoreh, Ka, "movw %H0, %M1" }, + { Ostoreb, Ka, "movb %B0, %M1" }, + { Ostores, Ka, "movss %S0, %M1" }, + { Ostored, Ka, "movsd %D0, %M1" }, + { Oload, Ka, "mov%k %M0, %=" }, + { Oloadsw, Kl, "movslq %M0, %L=" }, + { Oloadsw, Kw, "movl %M0, %W=" }, + { Oloaduw, Ki, "movl %M0, %W=" }, + { Oloadsh, Ki, "movsw%k %M0, %=" }, + { Oloaduh, Ki, "movzw%k %M0, %=" }, + { Oloadsb, Ki, "movsb%k %M0, %=" }, + { Oloadub, Ki, "movzb%k %M0, %=" }, + { Oextsw, Kl, "movslq %W0, %L=" }, + { Oextuw, Kl, "movl %W0, %W=" }, + { Oextsh, Ki, "movsw%k %H0, %=" }, + { Oextuh, Ki, "movzw%k %H0, %=" }, + { Oextsb, Ki, "movsb%k %B0, %=" }, + { 
Oextub, Ki, "movzb%k %B0, %=" }, + + { Oexts, Kd, "cvtss2sd %0, %=" }, + { Otruncd, Ks, "cvttsd2ss %0, %=" }, + { Ostosi, Ki, "cvttss2si%k %0, %=" }, + { Odtosi, Ki, "cvttsd2si%k %0, %=" }, + { Oswtof, Ka, "cvtsi2%k %W0, %=" }, + { Osltof, Ka, "cvtsi2%k %L0, %=" }, + { Ocast, Ki, "movq %D0, %L=" }, + { Ocast, Ka, "movq %L0, %D=" }, + + { Oaddr, Ki, "lea%k %M0, %=" }, + { Oswap, Ki, "xchg%k %0, %1" }, + { Osign, Kl, "cqto" }, + { Osign, Kw, "cltd" }, + { Oxdiv, Ki, "div%k %0" }, + { Oxidiv, Ki, "idiv%k %0" }, + { Oxcmp, Ks, "comiss %S0, %S1" }, + { Oxcmp, Kd, "comisd %D0, %D1" }, + { Oxcmp, Ki, "cmp%k %0, %1" }, + { Oxtest, Ki, "test%k %0, %1" }, +#define X(c, s) \ + { Oflag+c, Ki, "set" s " %B=\n\tmovzb%k %B=, %=" }, + CMP(X) +#undef X + { NOp, 0, 0 } +}; + +static char *rname_win64[][4] = { + [RAX] = {"rax", "eax", "ax", "al"}, + [RBX] = {"rbx", "ebx", "bx", "bl"}, + [RCX] = {"rcx", "ecx", "cx", "cl"}, + [RDX] = {"rdx", "edx", "dx", "dl"}, + [RSI] = {"rsi", "esi", "si", "sil"}, + [RDI] = {"rdi", "edi", "di", "dil"}, + [RBP] = {"rbp", "ebp", "bp", "bpl"}, + [RSP] = {"rsp", "esp", "sp", "spl"}, + [R8 ] = {"r8" , "r8d", "r8w", "r8b"}, + [R9 ] = {"r9" , "r9d", "r9w", "r9b"}, + [R10] = {"r10", "r10d", "r10w", "r10b"}, + [R11] = {"r11", "r11d", "r11w", "r11b"}, + [R12] = {"r12", "r12d", "r12w", "r12b"}, + [R13] = {"r13", "r13d", "r13w", "r13b"}, + [R14] = {"r14", "r14d", "r14w", "r14b"}, + [R15] = {"r15", "r15d", "r15w", "r15b"}, +}; + +static int +slot(int s, Fn *fn) +{ + struct { int i:29; } x; + + x.i = s; + assert(x.i <= fn->slot); + if (x.i < 0) + return -4 * x.i; + else + return -4 * (fn->slot - x.i); +} + +static char * +regtoa(int reg, int sz) +{ + static char buf[6]; + + if (reg >= XMM0) { + sprintf(buf, "xmm%d", reg-XMM0); + return buf; + } else + return rname_win64[reg][sz]; +} + +static void emitins(Ins, Fn *, FILE *); + +static void +emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f) +{ + Ins icp; + + icp.op = Ocopy; + icp.arg[0] = r2; + icp.to = r1; + 
icp.cls = k; + emitins(icp, fn, f); +} + +static void +emitcon(Con *con, FILE *f) +{ + char *p; + + switch (con->type) { + case CAddr: + p = con->local ? gasloc : gassym; + fprintf(f, "%s%s", p, str(con->label)); + if (con->bits.i) + fprintf(f, "%+"PRId64, con->bits.i); + break; + case CBits: + fprintf(f, "%"PRId64, con->bits.i); + break; + default: + die("unreachable"); + } +} + +static Ref +getarg(char c, Ins *i) +{ + switch (c) { + case '0': + return i->arg[0]; + case '1': + return i->arg[1]; + case '=': + return i->to; + default: + die("invalid arg letter %c", c); + } +} + +static void +emitf(char *s, Ins *i, Fn *fn, FILE *f) +{ + static char clstoa[][3] = {"l", "q", "ss", "sd"}; + char c; + int sz; + Ref ref; + Mem *m; + Con off; + + switch (*s) { + case '+': + if (req(i->arg[1], i->to)) { + ref = i->arg[0]; + i->arg[0] = i->arg[1]; + i->arg[1] = ref; + } + /* fall through */ + case '-': + assert((!req(i->arg[1], i->to) || req(i->arg[0], i->to)) && + "cannot convert to 2-address"); + emitcopy(i->to, i->arg[0], i->cls, fn, f); + s++; + break; + } + + fputc('\t', f); +Next: + while ((c = *s++) != '%') + if (!c) { + fputc('\n', f); + return; + } else + fputc(c, f); + switch ((c = *s++)) { + case '%': + fputc('%', f); + break; + case 'k': + fputs(clstoa[i->cls], f); + break; + case '0': + case '1': + case '=': + sz = KWIDE(i->cls) ? 
SLong : SWord; + s--; + goto Ref; + case 'D': + case 'S': + sz = SLong; + Ref: + c = *s++; + ref = getarg(c, i); + switch (rtype(ref)) { + case RTmp: + assert(isreg(ref)); + fprintf(f, "%%%s", regtoa(ref.val, sz)); + break; + case RSlot: + fprintf(f, "%d(%%rbp)", slot(ref.val, fn)); + break; + case RMem: + Mem: + m = &fn->mem[ref.val]; + if (rtype(m->base) == RSlot) { + off.type = CBits; + off.bits.i = slot(m->base.val, fn); + addcon(&m->offset, &off); + m->base = TMP(RBP); + } + if (m->offset.type != CUndef) + emitcon(&m->offset, f); + fputc('(', f); + if (req(m->base, R)) + fprintf(f, "%%rip"); + else + fprintf(f, "%%%s", regtoa(m->base.val, SLong)); + if (!req(m->index, R)) + fprintf(f, ", %%%s, %d", + regtoa(m->index.val, SLong), + m->scale + ); + fputc(')', f); + break; + case RCon: + fputc('$', f); + emitcon(&fn->con[ref.val], f); + break; + default: + die("unreachable"); + } + break; + case 'L': + sz = SLong; + goto Ref; + case 'W': + sz = SWord; + goto Ref; + case 'H': + sz = SShort; + goto Ref; + case 'B': + sz = SByte; + goto Ref; + case 'M': + c = *s++; + ref = getarg(c, i); + switch (rtype(ref)) { + case RMem: + goto Mem; + case RSlot: + fprintf(f, "%d(%%rbp)", slot(ref.val, fn)); + break; + case RCon: + emitcon(&fn->con[ref.val], f); + fprintf(f, "(%%rip)"); + break; + case RTmp: + assert(isreg(ref)); + fprintf(f, "(%%%s)", regtoa(ref.val, SLong)); + break; + default: + die("unreachable"); + } + break; + default: + die("invalid format specifier %%%c", c); + } + goto Next; +} + +static void *negmask[4] = { + [Ks] = (uint32_t[4]){ 0x80000000 }, + [Kd] = (uint64_t[2]){ 0x8000000000000000 }, +}; + +static void +emitins(Ins i, Fn *fn, FILE *f) +{ + Ref r; + int64_t val; + int o; + + switch (i.op) { + default: + Table: + for (o=0;; o++) { + if (omap_win64[o].op == NOp) + die("no match for %s(%d)", + optab[i.op].name, "wlsd"[i.cls]); + if (omap_win64[o].op == i.op) + if (omap_win64[o].cls == i.cls + || (omap_win64[o].cls == Ki && KBASE(i.cls) == 0) + || 
(omap_win64[o].cls == Ka)) + break; + } + emitf(omap_win64[o].asm, &i, fn, f); + break; + case Onop: + break; + case Omul: + if (rtype(i.arg[1]) == RCon) { + r = i.arg[0]; + i.arg[0] = i.arg[1]; + i.arg[1] = r; + } + if (KBASE(i.cls) == 0 + && rtype(i.arg[0]) == RCon + && rtype(i.arg[1]) == RTmp) { + emitf("imul%k %0, %1, %=", &i, fn, f); + break; + } + goto Table; + case Osub: + if (req(i.to, i.arg[1])) { + if (KBASE(i.cls) == 0) + emitf("neg%k %=", &i, fn, f); + else + fprintf(f, + "\txorp%c %sfp%d(%%rip), %%%s\n", + "xxsd"[i.cls], + gasloc, + gasstash(negmask[i.cls], 16), + regtoa(i.to.val, SLong) + ); + emitf("add%k %0, %=", &i, fn, f); + break; + } + goto Table; + case Odiv: + if (req(i.to, i.arg[1])) { + i.arg[1] = TMP(XMM0+15); + emitf("mov%k %=, %1", &i, fn, f); + emitf("mov%k %0, %=", &i, fn, f); + i.arg[0] = i.to; + } + goto Table; + case Ocopy: + if (req(i.to, R) || req(i.arg[0], R)) + break; + if (isreg(i.to) + && rtype(i.arg[0]) == RCon + && i.cls == Kl + && fn->con[i.arg[0].val].type == CBits + && (val = fn->con[i.arg[0].val].bits.i) >= 0 + && val <= UINT32_MAX) { + emitf("movl %W0, %W=", &i, fn, f); + } else if (isreg(i.to) + && rtype(i.arg[0]) == RCon + && fn->con[i.arg[0].val].type == CAddr) { + emitf("lea%k %M0, %=", &i, fn, f); + } else if (!req(i.arg[0], i.to)) + emitf("mov%k %0, %=", &i, fn, f); + break; + case Ocall: + switch (rtype(i.arg[0])) { + case RCon: + fprintf(f, "\tcallq "); + emitcon(&fn->con[i.arg[0].val], f); + fprintf(f, "\n"); + break; + case RTmp: + emitf("callq *%L0", &i, fn, f); + break; + default: + die("invalid call argument"); + } + break; + case Osalloc: + emitf("subq %L0, %%rsp", &i, fn, f); + if (!req(i.to, R)) + emitcopy(i.to, TMP(RSP), Kl, fn, f); + break; + case Oswap: + if (KBASE(i.cls) == 0) + goto Table; + emitcopy(TMP(XMM0+15), i.arg[0], i.cls, fn, f); + emitcopy(i.arg[0], i.arg[1], i.cls, fn, f); + emitcopy(i.arg[1], TMP(XMM0+15), i.cls, fn, f); + break; + } +} + +static uint64_t +framesz(Fn *fn, uint *save_gpr, 
uint *save_xmm, uint64_t *localsz) +{ + uint64_t f; + int *r; + + *save_gpr = 0; + *save_xmm = 0; + for (r=amd64_win64_rclob; *r>=0; r++) { + if (*r >= XMM0) + *save_xmm += !!(fn->reg & BIT(*r)); + else + *save_gpr += !!(fn->reg & BIT(*r)); + } + + f = fn->slot; + f = (f + 3) & -4; + f = 4*f; + *localsz = f; /* bytes of local slots, a multiple of 16 */ + f += 8 * (*save_gpr); /* one 8-byte slot per saved GPR */ + f += 16 * (*save_xmm); /* one 16-byte slot per saved XMM */ + f = (f + 15) & -16; + return f; +} + +void +amd64_win64_emitfn(Fn *fn, FILE *f) +{ + static char *ctoa[] = { + #define X(c, s) [c] = s, + CMP(X) + #undef X + }; + Blk *b, *s; + Ins *i; + int *r, c, lbl; + uint save_gpr, save_xmm; + uint64_t fs, off, localsz; + uint64_t gpr_base, xmm_base; + + fprintf(f, ".text\n"); + if (fn->export) + fprintf(f, ".globl %s%s\n", gassym, fn->name); + fprintf(f, + "%s%s:\n" + "\tpushq %%rbp\n" + "\tmovq %%rsp, %%rbp\n", + gassym, fn->name + ); + + fs = framesz(fn, &save_gpr, &save_xmm, &localsz); + if (fs) + fprintf(f, "\tsub $%"PRIu64", %%rsp\n", fs); + + gpr_base = localsz; + off = 8 * save_gpr; /* first saved GPR takes the slot farthest from rbp, the slot the Jret0 code reloads it from */ + for (r=amd64_win64_rclob; *r>=0 && *r < XMM0; r++) { + if (!(fn->reg & BIT(*r))) + continue; + fprintf(f, "\tmovq %%%s, -%"PRIu64"(%%rbp)\n", + regtoa(*r, SLong), + gpr_base + off); + off -= 8; + } + xmm_base = localsz + 8 * save_gpr; + off = 16 * save_xmm; /* same descending layout for the XMM save area */ + for (; *r>=0; r++) { + if (!(fn->reg & BIT(*r))) + continue; + fprintf(f, "\tmovdqu %%%s, -%"PRIu64"(%%rbp)\n", + regtoa(*r, SLong), + xmm_base + off); + off -= 16; + } + + for (lbl=0, b=fn->start; b; b=b->link) { + if (lbl || b->npred > 1) + fprintf(f, "%sbb%d:\n", gasloc, b->id); + for (i=b->ins; i!=&b->ins[b->nins]; i++) + emitins(*i, fn, f); + lbl = 1; + switch (b->jmp.type) { + case Jret0: + off = xmm_base + 16 * save_xmm; + for (r=&amd64_win64_rclob[0]; *r>=0; r++) + if (*r >= XMM0 && (fn->reg & BIT(*r))) { + off -= 16; + fprintf(f, "\tmovdqu -%"PRIu64"(%%rbp), %%%s\n", + off + 16, + regtoa(*r, SLong)); + } + off = gpr_base + 8 * save_gpr; + for (r=&amd64_win64_rclob[0]; *r>=0 && *r < XMM0; r++) + if (fn->reg & BIT(*r)) { + off -= 8; +
fprintf(f, "\tmovq -%"PRIu64"(%%rbp), %%%s\n", + off + 8, + regtoa(*r, SLong)); + } + if (fn->dynalloc) + fprintf(f, + "\tmovq %%rbp, %%rsp\n" + "\tsubq $%"PRIu64", %%rsp\n", + fs + ); + fprintf(f, + "\tleave\n" + "\tret\n" + ); + break; + case Jjmp: + Jmp: + if (b->s1 != b->link) + fprintf(f, "\tjmp %sbb%d\n", + gasloc, b->s1->id); + else + lbl = 0; + break; + default: + c = b->jmp.type - Jjf; + if (0 <= c && c <= NCmp) { + if (b->link == b->s2) { + s = b->s1; + b->s1 = b->s2; + b->s2 = s; + } else + c = cmpneg(c); + fprintf(f, "\tj%s %sbb%d\n", ctoa[c], + gasloc, b->s2->id); + goto Jmp; + } + die("unhandled jump %d", b->jmp.type); + } + } +} diff --git a/main.c b/main.c index 033ed9c..f7c1285 100644 --- a/main.c +++ b/main.c @@ -6,6 +6,7 @@ Target T; extern Target T_amd64_sysv; +extern Target T_amd64_win64; extern Target T_arm64; static struct TMap { @@ -13,6 +14,7 @@ static struct TMap { Target *T; } tmap[] = { { "amd64_sysv", &T_amd64_sysv }, + { "amd64_win64", &T_amd64_win64 }, { "arm64", &T_arm64 }, { 0, 0 } }; diff --git a/mem.c b/mem.c index eda3d18..4a5fcc4 100644 --- a/mem.c +++ b/mem.c @@ -25,6 +25,8 @@ memopt(Fn *fn) for (u=t->use; u != &t->use[t->nuse]; u++) { if (u->type != UIns) goto Skip; + if (u->bid != b->id) + goto Skip; l = u->u.ins; if (isload(l->op)) if (s == -1 || s == loadsz(l)) { diff --git a/rega.c b/rega.c index 2f01c07..8fec3f6 100644 --- a/rega.c +++ b/rega.c @@ -65,6 +65,17 @@ rfind(RMap *m, int t) return -1; } +static int +regused(RMap *m, int r, int skip) +{ + int i; + + for (i=0; in; i++) + if (i != skip && m->r[i] == r) + return 1; + return 0; +} + static Ref rref(RMap *m, int t) { @@ -476,6 +487,7 @@ rega(Fn *fn) int j, t, r, x, rl[Tmp0]; Blk *b, *b1, *s, ***ps, *blist, **blk, **bp; RMap *end, *beg, cur, old, *m; + BSet def[1]; Ins *i; Phi *p; uint u, n; @@ -496,6 +508,7 @@ rega(Fn *fn) } bsinit(cur.b, fn->ntmp); bsinit(old.b, fn->ntmp); + bsinit(def, fn->ntmp); loop = INT_MAX; for (t=0; tntmp; t++) { @@ -548,6 +561,13 @@ rega(Fn 
*fn) for (s=fn->start; s; s=s->link) { if (s->npred <= 1) continue; + bszero(def); + for (p=s->phi; p; p=p->link) + if (rtype(p->to) == RTmp) + bsset(def, p->to.val); + for (i=s->ins; i-s->ins < s->nins; i++) + if (rtype(i->to) == RTmp) + bsset(def, i->to.val); m = &beg[s->id]; /* rl maps a register that is live at the @@ -568,7 +588,11 @@ rega(Fn *fn) if (rtype(src) != RTmp) continue; x = rfind(&end[b->id], src.val); - assert(x != -1); + if (x == -1) { + fprintf(stderr, "rega missing phi tmp %d(%s) in pred %d(%s) for block %d(%s) in %s\n", + src.val, tmp[src.val].name, b->id, b->name, s->id, s->name, fn->name); + abort(); + } rl[r] = (!rl[r] || rl[r] == x) ? x : -1; } if (rl[r] == 0) @@ -579,11 +603,17 @@ rega(Fn *fn) for (j=0; jn; j++) { t = m->t[j]; r = m->r[j]; - if (rl[r] || t < Tmp0 /* todo, remove this */) + if (rl[r] || t < Tmp0 /* todo, remove this */ || bshas(def, t)) continue; for (bp=s->pred; bp<&s->pred[s->npred]; bp++) { x = rfind(&end[(*bp)->id], t); - assert(x != -1); + if (x == -1) { + if (tmp[t].slot != -1) + continue; + fprintf(stderr, "rega missing tmp %d(%s) in pred %d(%s) for block %d(%s) in %s\n", + t, tmp[t].name, (*bp)->id, (*bp)->name, s->id, s->name, fn->name); + abort(); + } rl[r] = (!rl[r] || rl[r] == x) ? x : -1; } } @@ -595,7 +625,14 @@ rega(Fn *fn) x = rl[r]; assert(x != 0 || t < Tmp0 /* todo, ditto */); if (x > 0) { + if (x != r && regused(m, x, j)) + continue; pmadd(TMP(x), TMP(r), tmp[t].cls); + if (x != r) { + if (!regused(m, r, j)) + bsclr(m->b, r); + bsset(m->b, x); + } m->r[j] = x; } }