diff options
author | David Walter Seikel | 2012-01-23 23:36:30 +1000 |
---|---|---|
committer | David Walter Seikel | 2012-01-23 23:36:30 +1000 |
commit | 6523585c66c04cea54df50013df8886b589847d8 (patch) | |
tree | 0b22aee7064166d88595eda260ca2d17c0773da5 /libraries/luajit-2.0/src/lj_opt_loop.c | |
parent | Update the EFL to what I'm actually using, coz I'm using some stuff not yet r... (diff) | |
download | SledjHamr-6523585c66c04cea54df50013df8886b589847d8.zip SledjHamr-6523585c66c04cea54df50013df8886b589847d8.tar.gz SledjHamr-6523585c66c04cea54df50013df8886b589847d8.tar.bz2 SledjHamr-6523585c66c04cea54df50013df8886b589847d8.tar.xz |
Add luaproc and LuaJIT libraries.
Two versions of LuaJIT, the stable release, and the dev version. Try the dev version first, until it fails badly.
Diffstat (limited to '')
-rw-r--r-- | libraries/luajit-2.0/src/lj_opt_loop.c | 429 |
1 files changed, 429 insertions, 0 deletions
diff --git a/libraries/luajit-2.0/src/lj_opt_loop.c b/libraries/luajit-2.0/src/lj_opt_loop.c new file mode 100644 index 0000000..460297b --- /dev/null +++ b/libraries/luajit-2.0/src/lj_opt_loop.c | |||
@@ -0,0 +1,429 @@ | |||
1 | /* | ||
2 | ** LOOP: Loop Optimizations. | ||
3 | ** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h | ||
4 | */ | ||
5 | |||
6 | #define lj_opt_loop_c | ||
7 | #define LUA_CORE | ||
8 | |||
9 | #include "lj_obj.h" | ||
10 | |||
11 | #if LJ_HASJIT | ||
12 | |||
13 | #include "lj_err.h" | ||
14 | #include "lj_str.h" | ||
15 | #include "lj_ir.h" | ||
16 | #include "lj_jit.h" | ||
17 | #include "lj_iropt.h" | ||
18 | #include "lj_trace.h" | ||
19 | #include "lj_snap.h" | ||
20 | #include "lj_vm.h" | ||
21 | |||
22 | /* Loop optimization: | ||
23 | ** | ||
24 | ** Traditional Loop-Invariant Code Motion (LICM) splits the instructions | ||
25 | ** of a loop into invariant and variant instructions. The invariant | ||
26 | ** instructions are hoisted out of the loop and only the variant | ||
27 | ** instructions remain inside the loop body. | ||
28 | ** | ||
29 | ** Unfortunately LICM is mostly useless for compiling dynamic languages. | ||
30 | ** The IR has many guards and most of the subsequent instructions are | ||
31 | ** control-dependent on them. The first non-hoistable guard would | ||
32 | ** effectively prevent hoisting of all subsequent instructions. | ||
33 | ** | ||
34 | ** That's why we use a special form of unrolling using copy-substitution, | ||
35 | ** combined with redundancy elimination: | ||
36 | ** | ||
37 | ** The recorded instruction stream is re-emitted to the compiler pipeline | ||
38 | ** with substituted operands. The substitution table is filled with the | ||
39 | ** refs returned by re-emitting each instruction. This can be done | ||
40 | ** on-the-fly, because the IR is in strict SSA form, where every ref is | ||
41 | ** defined before its use. | ||
42 | ** | ||
** This approach generates two code sections, separated by the LOOP
44 | ** instruction: | ||
45 | ** | ||
46 | ** 1. The recorded instructions form a kind of pre-roll for the loop. It | ||
47 | ** contains a mix of invariant and variant instructions and performs | ||
48 | ** exactly one loop iteration (but not necessarily the 1st iteration). | ||
49 | ** | ||
50 | ** 2. The loop body contains only the variant instructions and performs | ||
51 | ** all remaining loop iterations. | ||
52 | ** | ||
53 | ** On first sight that looks like a waste of space, because the variant | ||
54 | ** instructions are present twice. But the key insight is that the | ||
55 | ** pre-roll honors the control-dependencies for *both* the pre-roll itself | ||
56 | ** *and* the loop body! | ||
57 | ** | ||
58 | ** It also means one doesn't have to explicitly model control-dependencies | ||
59 | ** (which, BTW, wouldn't help LICM much). And it's much easier to | ||
60 | ** integrate sparse snapshotting with this approach. | ||
61 | ** | ||
62 | ** One of the nicest aspects of this approach is that all of the | ||
63 | ** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be | ||
64 | ** reused with only minor restrictions (e.g. one should not fold | ||
65 | ** instructions across loop-carried dependencies). | ||
66 | ** | ||
67 | ** But in general all optimizations can be applied which only need to look | ||
68 | ** backwards into the generated instruction stream. At any point in time | ||
69 | ** during the copy-substitution process this contains both a static loop | ||
70 | ** iteration (the pre-roll) and a dynamic one (from the to-be-copied | ||
71 | ** instruction up to the end of the partial loop body). | ||
72 | ** | ||
73 | ** Since control-dependencies are implicitly kept, CSE also applies to all | ||
74 | ** kinds of guards. The major advantage is that all invariant guards can | ||
75 | ** be hoisted, too. | ||
76 | ** | ||
77 | ** Load/store forwarding works across loop iterations, too. This is | ||
78 | ** important if loop-carried dependencies are kept in upvalues or tables. | ||
79 | ** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may | ||
80 | ** become a forwarded loop-recurrence after inlining. | ||
81 | ** | ||
82 | ** Since the IR is in SSA form, loop-carried dependencies have to be | ||
83 | ** modeled with PHI instructions. The potential candidates for PHIs are | ||
84 | ** collected on-the-fly during copy-substitution. After eliminating the | ||
85 | ** redundant ones, PHI instructions are emitted *below* the loop body. | ||
86 | ** | ||
87 | ** Note that this departure from traditional SSA form doesn't change the | ||
88 | ** semantics of the PHI instructions themselves. But it greatly simplifies | ||
89 | ** on-the-fly generation of the IR and the machine code. | ||
90 | */ | ||
91 | |||
/* Some local macros to save typing. Undef'd at the end. */
/* IR(ref): pointer to the IR instruction for 'ref' in the current trace.
** Requires a jit_State pointer named 'J' in scope.
*/
#define IR(ref) (&J->cur.ir[(ref)])

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Emit raw IR without passing through optimizations. */
#define emitir_raw(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))
100 | |||
101 | /* -- PHI elimination ----------------------------------------------------- */ | ||
102 | |||
/* Emit or eliminate collected PHIs.
** subst maps each pre-roll ref to its copy-substituted loop-body ref.
** phi[] holds the candidate left-hand PHI refs collected during
** copy-substitution (nphi entries, capacity LJ_MAX_PHI). onsnap is the
** number of snapshots present before unrolling; snapshots >= onsnap
** belong to the copy-substituted part.
** The IR mark bit is reused here as a "(potentially) redundant PHI" flag
** and cleared again for eliminated PHIs in pass #5.
*/
static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi,
			  SnapNo onsnap)
{
  int passx = 0;  /* Set when the quick pass #1 check is inconclusive. */
  IRRef i, nslots;
  IRRef invar = J->chain[IR_LOOP];  /* Ref of the LOOP instruction. */
  /* Pass #1: mark redundant and potentially redundant PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRRef rref = subst[lref];
    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
      irt_setmark(IR(lref)->t);
    } else if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
      /* Quick check for simple recurrences failed, need pass2. */
      irt_setmark(IR(lref)->t);
      passx = 1;
    }
  }
  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
  if (passx) {
    SnapNo s;
    /* Walk the variant part (refs above the LOOP instruction) backwards
    ** and unmark every operand that is actually used there.
    */
    for (i = J->cur.nins-1; i > invar; i--) {
      IRIns *ir = IR(i);
      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
      if (!irref_isk(ir->op1)) {
	irt_clearmark(IR(ir->op1)->t);
	/* Calls chain their arguments through CARG instructions which may
	** live in the invariant part; unmark those operands, too.
	*/
	if (ir->op1 < invar &&
	    ir->o >= IR_CALLN && ir->o <= IR_CARG) {  /* ORDER IR */
	  ir = IR(ir->op1);
	  while (ir->o == IR_CARG) {
	    if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
	    if (irref_isk(ir->op1)) break;
	    ir = IR(ir->op1);
	    irt_clearmark(ir->t);
	  }
	}
      }
    }
    /* Refs held live by snapshots of the copied part count as uses, too. */
    for (s = J->cur.nsnap-1; s >= onsnap; s--) {
      SnapShot *snap = &J->cur.snap[s];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      for (n = 0; n < nent; n++) {
	IRRef ref = snap_ref(map[n]);
	if (!irref_isk(ref)) irt_clearmark(IR(ref)->t);
      }
    }
  }
  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
  nslots = J->baseslot+J->maxslot;
  for (i = 1; i < nslots; i++) {
    IRRef ref = tref_ref(J->slot[i]);
    /* Follow the substitution chain for this stack slot, adding PHI
    ** candidates until hitting a constant, an existing PHI, a primitive
    ** type, or a ref in the variant part.
    */
    while (!irref_isk(ref) && ref != subst[ref]) {
      IRIns *ir = IR(ref);
      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
      if (irt_isphi(ir->t) || irt_ispri(ir->t))
	break;
      irt_setphi(ir->t);
      if (nphi >= LJ_MAX_PHI)
	lj_trace_err(J, LJ_TRERR_PHIOV);
      phi[nphi++] = (IRRef1)ref;
      ref = subst[ref];
      if (ref > invar)
	break;
    }
  }
  /* Pass #4: propagate non-redundant PHIs. Iterates to a fixed point,
  ** since unmarking one PHI may make another one non-redundant.
  */
  while (passx) {
    passx = 0;
    for (i = 0; i < nphi; i++) {
      IRRef lref = phi[i];
      IRIns *ir = IR(lref);
      if (!irt_ismarked(ir->t)) {  /* Propagate only from unmarked PHIs. */
	IRRef rref = subst[lref];
	if (lref == rref) {  /* Mark redundant PHI. */
	  irt_setmark(ir->t);
	} else {
	  IRIns *irr = IR(rref);
	  if (irt_ismarked(irr->t)) {  /* Right ref points to other PHI? */
	    irt_clearmark(irr->t);  /* Mark that PHI as non-redundant. */
	    passx = 1;  /* Retry. */
	  }
	}
      }
    }
  }
  /* Pass #5: emit PHI instructions or eliminate PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRIns *ir = IR(lref);
    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
      IRRef rref = subst[lref];
      if (rref > invar)
	irt_setphi(IR(rref)->t);
      /* Emitted raw: PHIs must not go through the FOLD/CSE pipeline. */
      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
    } else {  /* Otherwise eliminate PHI. */
      irt_clearmark(ir->t);
      irt_clearphi(ir->t);
    }
  }
}
205 | |||
206 | /* -- Loop unrolling using copy-substitution ------------------------------ */ | ||
207 | |||
/* Copy-substitute snapshot.
** Re-emits the pre-roll snapshot *osnap for the copied part: each
** non-constant entry is remapped through subst[], and slots not covered
** by osnap are filled in from the loop snapshot's map (loopmap). Both
** maps are walked in ascending slot order, so this is a linear merge.
** NOTE(review): the final loopmap scan relies on the sentinel the caller
** installs at the end of loopmap (slot value 255 >= any nslots) to
** terminate — confirm against loop_unroll's *psentinel handling.
*/
static void loop_subst_snap(jit_State *J, SnapShot *osnap,
			    SnapEntry *loopmap, IRRef1 *subst)
{
  SnapEntry *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
  SnapEntry *nextmap = &J->cur.snapmap[snap_nextofs(&J->cur, osnap)];
  MSize nmapofs;
  MSize on, ln, nn, onent = osnap->nent;
  BCReg nslots = osnap->nslots;
  SnapShot *snap = &J->cur.snap[J->cur.nsnap];
  if (irt_isguard(J->guardemit)) {  /* Guard inbetween? */
    nmapofs = J->cur.nsnapmap;
    J->cur.nsnap++;  /* Add new snapshot. */
  } else {  /* Otherwise overwrite previous snapshot. */
    snap--;
    nmapofs = snap->mapofs;
  }
  J->guardemit.irt = 0;  /* Reset guard tracking for the next snapshot. */
  /* Setup new snapshot. */
  snap->mapofs = (uint16_t)nmapofs;
  snap->ref = (IRRef1)J->cur.nins;
  snap->nslots = nslots;
  snap->topslot = osnap->topslot;
  snap->count = 0;
  nmap = &J->cur.snapmap[nmapofs];
  /* Substitute snapshot slots: merge loopmap and the substituted omap. */
  on = ln = nn = 0;
  while (on < onent) {
    SnapEntry osn = omap[on], lsn = loopmap[ln];
    if (snap_slot(lsn) < snap_slot(osn)) {  /* Copy slot from loop map. */
      nmap[nn++] = lsn;
      ln++;
    } else {  /* Copy substituted slot from snapshot map. */
      if (snap_slot(lsn) == snap_slot(osn)) ln++;  /* Shadowed loop slot. */
      if (!irref_isk(snap_ref(osn)))
	osn = snap_setref(osn, subst[snap_ref(osn)]);
      nmap[nn++] = osn;
      on++;
    }
  }
  while (snap_slot(loopmap[ln]) < nslots)  /* Copy remaining loop slots. */
    nmap[nn++] = loopmap[ln++];
  snap->nent = (uint8_t)nn;
  omap += onent;
  nmap += nn;
  while (omap < nextmap)  /* Copy PC + frame links. */
    *nmap++ = *omap++;
  J->cur.nsnapmap = (uint16_t)(nmap - J->cur.snapmap);
}
257 | |||
/* Unroll loop.
** Re-emits every recorded instruction (the pre-roll) through the FOLD/CSE
** pipeline with operands remapped via the substitution table, building the
** variant loop body after the LOOP instruction. Snapshots are
** copy-substituted alongside. PHI candidates for loop-carried dependencies
** are collected and finally emitted/eliminated by loop_emit_phi().
** Throws a trace error (LJ_TRERR_*) on overflow or type instability, which
** the protected caller lj_opt_loop() catches.
*/
static void loop_unroll(jit_State *J)
{
  IRRef1 phi[LJ_MAX_PHI];  /* Collected PHI candidates (left refs). */
  uint32_t nphi = 0;
  IRRef1 *subst;  /* Substitution table, biased by REF_BIAS. */
  SnapNo onsnap;
  SnapShot *osnap, *loopsnap;
  SnapEntry *loopmap, *psentinel;
  IRRef ins, invar;

  /* Use temp buffer for substitution table.
  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
  ** Caveat: don't call into the VM or run the GC or the buffer may be gone.
  */
  invar = J->cur.nins;
  subst = (IRRef1 *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf,
				   (invar-REF_BIAS)*sizeof(IRRef1)) - REF_BIAS;
  subst[REF_BASE] = REF_BASE;

  /* LOOP separates the pre-roll from the loop body. */
  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);

  /* Grow snapshot buffer and map for copy-substituted snapshots.
  ** Need up to twice the number of snapshots minus #0 and loop snapshot.
  ** Need up to twice the number of entries plus fallback substitutions
  ** from the loop snapshot entries for each new snapshot.
  ** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
  */
  onsnap = J->cur.nsnap;
  lj_snap_grow_buf(J, 2*onsnap-2);
  lj_snap_grow_map(J, J->cur.nsnapmap*2+(onsnap-2)*J->cur.snap[onsnap-1].nent);

  /* The loop snapshot is used for fallback substitutions. */
  loopsnap = &J->cur.snap[onsnap-1];
  loopmap = &J->cur.snapmap[loopsnap->mapofs];
  /* The PC of snapshot #0 and the loop snapshot must match. */
  psentinel = &loopmap[loopsnap->nent];
  lua_assert(*psentinel == J->cur.snapmap[J->cur.snap[0].nent]);
  *psentinel = SNAP(255, 0, 0);  /* Replace PC with temporary sentinel. */

  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
  osnap = &J->cur.snap[1];

  /* Copy and substitute all recorded instructions and snapshots. */
  for (ins = REF_FIRST; ins < invar; ins++) {
    IRIns *ir;
    IRRef op1, op2;

    if (ins >= osnap->ref)  /* Instruction belongs to next snapshot? */
      loop_subst_snap(J, osnap++, loopmap, subst);  /* Copy-substitute it. */

    /* Substitute instruction operands. */
    ir = IR(ins);
    op1 = ir->op1;
    if (!irref_isk(op1)) op1 = subst[op1];
    op2 = ir->op2;
    if (!irref_isk(op2)) op2 = subst[op2];
    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
	op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
      subst[ins] = (IRRef1)ins;  /* Shortcut: maps to itself. */
    } else {
      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
      subst[ins] = (IRRef1)ref;
      if (ref != ins && ref < invar) {  /* Loop-carried dependency? */
	IRIns *irr = IR(ref);
	/* Potential PHI? */
	if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
	  irt_setphi(irr->t);
	  if (nphi >= LJ_MAX_PHI)
	    lj_trace_err(J, LJ_TRERR_PHIOV);
	  phi[nphi++] = (IRRef1)ref;
	}
	/* Check all loop-carried dependencies for type instability. */
	if (!irt_sametype(t, irr->t)) {
	  if (irt_isinteger(t) && irt_isinteger(irr->t))
	    continue;
	  else if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num. */
	    ref = tref_ref(emitir(IRTN(IR_CONV), ref, IRCONV_NUM_INT));
	  else if (irt_isnum(irr->t) && irt_isinteger(t))  /* Fix num->int. */
	    ref = tref_ref(emitir(IRTGI(IR_CONV), ref,
				  IRCONV_INT_NUM|IRCONV_CHECK));
	  else
	    lj_trace_err(J, LJ_TRERR_TYPEINS);  /* Unfixable: abort unroll. */
	  subst[ins] = (IRRef1)ref;
	  /* May need a PHI for the CONV, too. */
	  irr = IR(ref);
	  if (ref < invar && !irref_isk(ref) && !irt_isphi(irr->t)) {
	    irt_setphi(irr->t);
	    if (nphi >= LJ_MAX_PHI)
	      lj_trace_err(J, LJ_TRERR_PHIOV);
	    phi[nphi++] = (IRRef1)ref;
	  }
	}
      }
    }
  }
  if (!irt_isguard(J->guardemit))  /* Drop redundant snapshot. */
    J->cur.nsnapmap = (uint16_t)J->cur.snap[--J->cur.nsnap].mapofs;
  lua_assert(J->cur.nsnapmap <= J->sizesnapmap);
  *psentinel = J->cur.snapmap[J->cur.snap[0].nent];  /* Restore PC. */

  loop_emit_phi(J, subst, phi, nphi, onsnap);
}
364 | |||
365 | /* Undo any partial changes made by the loop optimization. */ | ||
366 | static void loop_undo(jit_State *J, IRRef ins, SnapNo nsnap, MSize nsnapmap) | ||
367 | { | ||
368 | ptrdiff_t i; | ||
369 | SnapShot *snap = &J->cur.snap[nsnap-1]; | ||
370 | SnapEntry *map = J->cur.snapmap; | ||
371 | map[snap->mapofs + snap->nent] = map[J->cur.snap[0].nent]; /* Restore PC. */ | ||
372 | J->cur.nsnapmap = (uint16_t)nsnapmap; | ||
373 | J->cur.nsnap = nsnap; | ||
374 | J->guardemit.irt = 0; | ||
375 | lj_ir_rollback(J, ins); | ||
376 | for (i = 0; i < BPROP_SLOTS; i++) { /* Remove backprop. cache entries. */ | ||
377 | BPropEntry *bp = &J->bpropcache[i]; | ||
378 | if (bp->val >= ins) | ||
379 | bp->key = 0; | ||
380 | } | ||
381 | for (ins--; ins >= REF_FIRST; ins--) { /* Remove flags. */ | ||
382 | IRIns *ir = IR(ins); | ||
383 | irt_clearphi(ir->t); | ||
384 | irt_clearmark(ir->t); | ||
385 | } | ||
386 | } | ||
387 | |||
388 | /* Protected callback for loop optimization. */ | ||
389 | static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud) | ||
390 | { | ||
391 | UNUSED(L); UNUSED(dummy); | ||
392 | loop_unroll((jit_State *)ud); | ||
393 | return NULL; | ||
394 | } | ||
395 | |||
396 | /* Loop optimization. */ | ||
397 | int lj_opt_loop(jit_State *J) | ||
398 | { | ||
399 | IRRef nins = J->cur.nins; | ||
400 | SnapNo nsnap = J->cur.nsnap; | ||
401 | MSize nsnapmap = J->cur.nsnapmap; | ||
402 | int errcode = lj_vm_cpcall(J->L, NULL, J, cploop_opt); | ||
403 | if (LJ_UNLIKELY(errcode)) { | ||
404 | lua_State *L = J->L; | ||
405 | if (errcode == LUA_ERRRUN && tvisnumber(L->top-1)) { /* Trace error? */ | ||
406 | int32_t e = numberVint(L->top-1); | ||
407 | switch ((TraceError)e) { | ||
408 | case LJ_TRERR_TYPEINS: /* Type instability. */ | ||
409 | case LJ_TRERR_GFAIL: /* Guard would always fail. */ | ||
410 | /* Unrolling via recording fixes many cases, e.g. a flipped boolean. */ | ||
411 | if (--J->instunroll < 0) /* But do not unroll forever. */ | ||
412 | break; | ||
413 | L->top--; /* Remove error object. */ | ||
414 | loop_undo(J, nins, nsnap, nsnapmap); | ||
415 | return 1; /* Loop optimization failed, continue recording. */ | ||
416 | default: | ||
417 | break; | ||
418 | } | ||
419 | } | ||
420 | lj_err_throw(L, errcode); /* Propagate all other errors. */ | ||
421 | } | ||
422 | return 0; /* Loop optimization is ok. */ | ||
423 | } | ||
424 | |||
425 | #undef IR | ||
426 | #undef emitir | ||
427 | #undef emitir_raw | ||
428 | |||
429 | #endif | ||