Diffstat (limited to 'libraries/luajit-2.0/src/lj_opt_loop.c')
-rw-r--r--  libraries/luajit-2.0/src/lj_opt_loop.c  429
1 file changed, 429 insertions(+), 0 deletions(-)
diff --git a/libraries/luajit-2.0/src/lj_opt_loop.c b/libraries/luajit-2.0/src/lj_opt_loop.c
new file mode 100644
index 0000000..460297b
--- /dev/null
+++ b/libraries/luajit-2.0/src/lj_opt_loop.c
@@ -0,0 +1,429 @@
/*
** LOOP: Loop Optimizations.
** Copyright (C) 2005-2011 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_loop_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_err.h"
#include "lj_str.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#include "lj_snap.h"
#include "lj_vm.h"

/* Loop optimization:
**
** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
** of a loop into invariant and variant instructions. The invariant
** instructions are hoisted out of the loop and only the variant
** instructions remain inside the loop body.
**
** Unfortunately LICM is mostly useless for compiling dynamic languages.
** The IR has many guards and most of the subsequent instructions are
** control-dependent on them. The first non-hoistable guard would
** effectively prevent hoisting of all subsequent instructions.
**
** That's why we use a special form of unrolling using copy-substitution,
** combined with redundancy elimination:
**
** The recorded instruction stream is re-emitted to the compiler pipeline
** with substituted operands. The substitution table is filled with the
** refs returned by re-emitting each instruction. This can be done
** on-the-fly, because the IR is in strict SSA form, where every ref is
** defined before its use.
**
** This approach generates two code sections, separated by the LOOP
** instruction:
**
** 1. The recorded instructions form a kind of pre-roll for the loop. It
**    contains a mix of invariant and variant instructions and performs
**    exactly one loop iteration (but not necessarily the 1st iteration).
**
** 2. The loop body contains only the variant instructions and performs
**    all remaining loop iterations.
**
** On first sight that looks like a waste of space, because the variant
** instructions are present twice. But the key insight is that the
** pre-roll honors the control-dependencies for *both* the pre-roll itself
** *and* the loop body!
**
** It also means one doesn't have to explicitly model control-dependencies
** (which, BTW, wouldn't help LICM much). And it's much easier to
** integrate sparse snapshotting with this approach.
**
** One of the nicest aspects of this approach is that all of the
** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
** reused with only minor restrictions (e.g. one should not fold
** instructions across loop-carried dependencies).
**
** But in general all optimizations can be applied which only need to look
** backwards into the generated instruction stream. At any point in time
** during the copy-substitution process this contains both a static loop
** iteration (the pre-roll) and a dynamic one (from the to-be-copied
** instruction up to the end of the partial loop body).
**
** Since control-dependencies are implicitly kept, CSE also applies to all
** kinds of guards. The major advantage is that all invariant guards can
** be hoisted, too.
**
** Load/store forwarding works across loop iterations, too. This is
** important if loop-carried dependencies are kept in upvalues or tables.
** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
** become a forwarded loop-recurrence after inlining.
**
** Since the IR is in SSA form, loop-carried dependencies have to be
** modeled with PHI instructions. The potential candidates for PHIs are
** collected on-the-fly during copy-substitution. After eliminating the
** redundant ones, PHI instructions are emitted *below* the loop body.
**
** Note that this departure from traditional SSA form doesn't change the
** semantics of the PHI instructions themselves. But it greatly simplifies
** on-the-fly generation of the IR and the machine code.
*/
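
To make the copy-substitution scheme above concrete, here is a small self-contained sketch. It is not part of lj_opt_loop.c: the MiniIns type and mini_emit() are invented for illustration, and this "pipeline" does no folding or CSE. It only shows how re-emitting each recorded instruction with operands remapped through a substitution table produces the unrolled copy.

/* Illustrative sketch only -- a simplified model, not LuaJIT code. */
#include <stdio.h>

typedef struct MiniIns { const char *op; int op1, op2; } MiniIns;

#define MAXINS 64
static MiniIns ir[MAXINS];
static int nins = 0;

/* Emit one instruction and return its ref (= its index). The real pipeline
** runs FOLD/CSE here and may return the ref of an existing instruction,
** which is how invariant copies collapse back onto the pre-roll. */
static int mini_emit(const char *op, int op1, int op2)
{
  ir[nins].op = op; ir[nins].op1 = op1; ir[nins].op2 = op2;
  return nins++;
}

int main(void)
{
  int subst[MAXINS], invar, i;
  /* Record one "iteration" (the pre-roll). -1 marks an external operand. */
  int a = mini_emit("LOAD", -1, -1);   /* ref 0 */
  int b = mini_emit("ADD",   a,  a);   /* ref 1 */
  (void)mini_emit("MUL",     b,  a);   /* ref 2 */
  invar = nins;
  /* Copy-substitute: re-emit every recorded ref with remapped operands and
  ** record the resulting ref in the substitution table. Later copies then
  ** automatically pick up the refs of earlier copies (strict SSA order). */
  for (i = 0; i < invar; i++) {
    int op1 = ir[i].op1 >= 0 ? subst[ir[i].op1] : ir[i].op1;
    int op2 = ir[i].op2 >= 0 ? subst[ir[i].op2] : ir[i].op2;
    subst[i] = mini_emit(ir[i].op, op1, op2);
  }
  for (i = 0; i < nins; i++)
    printf("%s%04d %-4s %4d %4d\n", i == invar ? "---- LOOP ----\n" : "",
           i, ir[i].op, ir[i].op1, ir[i].op2);
  return 0;
}

In the real pass the re-emit goes through emitir(), i.e. FOLD/CSE, so invariant copies fold back onto their pre-roll refs and only the variant instructions remain below the LOOP marker.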

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)  (&J->cur.ir[(ref)])

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b)  (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Emit raw IR without passing through optimizations. */
#define emitir_raw(ot, a, b)  (lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))

/* -- PHI elimination ----------------------------------------------------- */

/* Emit or eliminate collected PHIs. */
static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi,
                          SnapNo onsnap)
{
  int passx = 0;
  IRRef i, nslots;
  IRRef invar = J->chain[IR_LOOP];
  /* Pass #1: mark redundant and potentially redundant PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRRef rref = subst[lref];
    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
      irt_setmark(IR(lref)->t);
    } else if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
      /* Quick check for simple recurrences failed, need pass2. */
      irt_setmark(IR(lref)->t);
      passx = 1;
    }
  }
  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
  if (passx) {
    SnapNo s;
    for (i = J->cur.nins-1; i > invar; i--) {
      IRIns *ir = IR(i);
      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
      if (!irref_isk(ir->op1)) {
        irt_clearmark(IR(ir->op1)->t);
        if (ir->op1 < invar &&
            ir->o >= IR_CALLN && ir->o <= IR_CARG) {  /* ORDER IR */
          ir = IR(ir->op1);
          while (ir->o == IR_CARG) {
            if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
            if (irref_isk(ir->op1)) break;
            ir = IR(ir->op1);
            irt_clearmark(ir->t);
          }
        }
      }
    }
    for (s = J->cur.nsnap-1; s >= onsnap; s--) {
      SnapShot *snap = &J->cur.snap[s];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      for (n = 0; n < nent; n++) {
        IRRef ref = snap_ref(map[n]);
        if (!irref_isk(ref)) irt_clearmark(IR(ref)->t);
      }
    }
  }
  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
  nslots = J->baseslot+J->maxslot;
  for (i = 1; i < nslots; i++) {
    IRRef ref = tref_ref(J->slot[i]);
    while (!irref_isk(ref) && ref != subst[ref]) {
      IRIns *ir = IR(ref);
      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
      if (irt_isphi(ir->t) || irt_ispri(ir->t))
        break;
      irt_setphi(ir->t);
      if (nphi >= LJ_MAX_PHI)
        lj_trace_err(J, LJ_TRERR_PHIOV);
      phi[nphi++] = (IRRef1)ref;
      ref = subst[ref];
      if (ref > invar)
        break;
    }
  }
  /* Pass #4: propagate non-redundant PHIs. */
  while (passx) {
    passx = 0;
    for (i = 0; i < nphi; i++) {
      IRRef lref = phi[i];
      IRIns *ir = IR(lref);
      if (!irt_ismarked(ir->t)) {  /* Propagate only from unmarked PHIs. */
        IRRef rref = subst[lref];
        if (lref == rref) {  /* Mark redundant PHI. */
          irt_setmark(ir->t);
        } else {
          IRIns *irr = IR(rref);
          if (irt_ismarked(irr->t)) {  /* Right ref points to other PHI? */
            irt_clearmark(irr->t);  /* Mark that PHI as non-redundant. */
            passx = 1;  /* Retry. */
          }
        }
      }
    }
  }
  /* Pass #5: emit PHI instructions or eliminate PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRIns *ir = IR(lref);
    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
      IRRef rref = subst[lref];
      if (rref > invar)
        irt_setphi(IR(rref)->t);
      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
    } else {  /* Otherwise eliminate PHI. */
      irt_clearmark(ir->t);
      irt_clearphi(ir->t);
    }
  }
}
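
The pass #1 classification above can be restated over a plain struct array: a PHI candidate whose left and right refs coincide is invariant and therefore redundant, a candidate whose right-hand instruction directly consumes the left ref is a simple recurrence and is kept, and anything else needs the full pass #2 use-scan. A minimal sketch (classify_phi and the PHI_* enum are invented for illustration; the REF_DROP case is omitted):

/* Illustrative sketch only -- not part of lj_opt_loop.c. */
#include <stdio.h>

typedef struct { int op1, op2; } Ins;
enum { PHI_REDUNDANT, PHI_KEEP, PHI_NEEDS_PASS2 };

/* lref is the pre-roll ref of the candidate, rref its substituted ref. */
static int classify_phi(const Ins *ir, int lref, int rref)
{
  if (lref == rref)                     /* Invariant value: no PHI needed. */
    return PHI_REDUNDANT;
  if (ir[rref].op1 == lref || ir[rref].op2 == lref)
    return PHI_KEEP;                    /* Simple recurrence, e.g. i' = i+1. */
  return PHI_NEEDS_PASS2;               /* Only a full use-scan can decide. */
}

int main(void)
{
  /* ref 2 reads ref 1; ref 3 is unrelated to ref 1. */
  Ins ir[4] = { {0,0}, {0,0}, {1,0}, {0,0} };
  printf("%d %d %d\n",
         classify_phi(ir, 1, 1),   /* PHI_REDUNDANT */
         classify_phi(ir, 1, 2),   /* PHI_KEEP */
         classify_phi(ir, 1, 3));  /* PHI_NEEDS_PASS2 */
  return 0;
}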

/* -- Loop unrolling using copy-substitution ------------------------------ */

/* Copy-substitute snapshot. */
static void loop_subst_snap(jit_State *J, SnapShot *osnap,
                            SnapEntry *loopmap, IRRef1 *subst)
{
  SnapEntry *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
  SnapEntry *nextmap = &J->cur.snapmap[snap_nextofs(&J->cur, osnap)];
  MSize nmapofs;
  MSize on, ln, nn, onent = osnap->nent;
  BCReg nslots = osnap->nslots;
  SnapShot *snap = &J->cur.snap[J->cur.nsnap];
  if (irt_isguard(J->guardemit)) {  /* Guard in between? */
    nmapofs = J->cur.nsnapmap;
    J->cur.nsnap++;  /* Add new snapshot. */
  } else {  /* Otherwise overwrite previous snapshot. */
    snap--;
    nmapofs = snap->mapofs;
  }
  J->guardemit.irt = 0;
  /* Setup new snapshot. */
  snap->mapofs = (uint16_t)nmapofs;
  snap->ref = (IRRef1)J->cur.nins;
  snap->nslots = nslots;
  snap->topslot = osnap->topslot;
  snap->count = 0;
  nmap = &J->cur.snapmap[nmapofs];
  /* Substitute snapshot slots. */
  on = ln = nn = 0;
  while (on < onent) {
    SnapEntry osn = omap[on], lsn = loopmap[ln];
    if (snap_slot(lsn) < snap_slot(osn)) {  /* Copy slot from loop map. */
      nmap[nn++] = lsn;
      ln++;
    } else {  /* Copy substituted slot from snapshot map. */
      if (snap_slot(lsn) == snap_slot(osn)) ln++;  /* Shadowed loop slot. */
      if (!irref_isk(snap_ref(osn)))
        osn = snap_setref(osn, subst[snap_ref(osn)]);
      nmap[nn++] = osn;
      on++;
    }
  }
  while (snap_slot(loopmap[ln]) < nslots)  /* Copy remaining loop slots. */
    nmap[nn++] = loopmap[ln++];
  snap->nent = (uint8_t)nn;
  omap += onent;
  nmap += nn;
  while (omap < nextmap)  /* Copy PC + frame links. */
    *nmap++ = *omap++;
  J->cur.nsnapmap = (uint16_t)(nmap - J->cur.snapmap);
}
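
The slot loop above is essentially a merge of two slot-sorted entry lists: entries from the snapshot being copied win over loop-snapshot entries for the same slot, and their refs are remapped through the substitution table, while missing slots fall back to the loop snapshot. A simplified, self-contained sketch (Entry and merge_snap are invented; the nslots cutoff and PC/frame-link copying are omitted):

/* Illustrative sketch only -- not part of lj_opt_loop.c. */
#include <stdio.h>

typedef struct { int slot, ref; } Entry;

static int merge_snap(Entry *out, const Entry *osn, int onent,
                      const Entry *loop, int lnent, const int *subst)
{
  int on = 0, ln = 0, nn = 0;
  while (on < onent) {
    if (ln < lnent && loop[ln].slot < osn[on].slot) {
      out[nn++] = loop[ln++];            /* Fallback from the loop snapshot. */
    } else {
      if (ln < lnent && loop[ln].slot == osn[on].slot)
        ln++;                            /* Shadowed loop slot. */
      out[nn] = osn[on++];
      out[nn].ref = subst[out[nn].ref];  /* Copy-substituted ref. */
      nn++;
    }
  }
  while (ln < lnent)                     /* Remaining loop slots. */
    out[nn++] = loop[ln++];
  return nn;
}

int main(void)
{
  Entry osn[]  = { {2, 10}, {4, 11} };        /* Snapshot being copied. */
  Entry loop[] = { {1, 5}, {2, 6}, {5, 7} };  /* Loop snapshot (fallback). */
  int subst[16] = {0}; Entry out[8]; int n, i;
  subst[10] = 20; subst[11] = 21;             /* Pretend substitutions. */
  n = merge_snap(out, osn, 2, loop, 3, subst);
  for (i = 0; i < n; i++)
    printf("slot %d -> ref %d\n", out[i].slot, out[i].ref);
  return 0;
}

The sketch prints slots 1, 2, 4 and 5, with slot 2 taken from the copied snapshot (ref 20) rather than the loop snapshot, mirroring the shadowing rule in the real code.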

/* Unroll loop. */
static void loop_unroll(jit_State *J)
{
  IRRef1 phi[LJ_MAX_PHI];
  uint32_t nphi = 0;
  IRRef1 *subst;
  SnapNo onsnap;
  SnapShot *osnap, *loopsnap;
  SnapEntry *loopmap, *psentinel;
  IRRef ins, invar;

  /* Use temp buffer for substitution table.
  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
  ** Caveat: don't call into the VM or run the GC or the buffer may be gone.
  */
  invar = J->cur.nins;
  subst = (IRRef1 *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf,
                                   (invar-REF_BIAS)*sizeof(IRRef1)) - REF_BIAS;
  subst[REF_BASE] = REF_BASE;

  /* LOOP separates the pre-roll from the loop body. */
  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);

  /* Grow snapshot buffer and map for copy-substituted snapshots.
  ** Need up to twice the number of snapshots minus #0 and loop snapshot.
  ** Need up to twice the number of entries plus fallback substitutions
  ** from the loop snapshot entries for each new snapshot.
  ** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
  */
  onsnap = J->cur.nsnap;
  lj_snap_grow_buf(J, 2*onsnap-2);
  lj_snap_grow_map(J, J->cur.nsnapmap*2+(onsnap-2)*J->cur.snap[onsnap-1].nent);

  /* The loop snapshot is used for fallback substitutions. */
  loopsnap = &J->cur.snap[onsnap-1];
  loopmap = &J->cur.snapmap[loopsnap->mapofs];
  /* The PC of snapshot #0 and the loop snapshot must match. */
  psentinel = &loopmap[loopsnap->nent];
  lua_assert(*psentinel == J->cur.snapmap[J->cur.snap[0].nent]);
  *psentinel = SNAP(255, 0, 0);  /* Replace PC with temporary sentinel. */

  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
  osnap = &J->cur.snap[1];

  /* Copy and substitute all recorded instructions and snapshots. */
  for (ins = REF_FIRST; ins < invar; ins++) {
    IRIns *ir;
    IRRef op1, op2;

    if (ins >= osnap->ref)  /* Instruction belongs to next snapshot? */
      loop_subst_snap(J, osnap++, loopmap, subst);  /* Copy-substitute it. */

    /* Substitute instruction operands. */
    ir = IR(ins);
    op1 = ir->op1;
    if (!irref_isk(op1)) op1 = subst[op1];
    op2 = ir->op2;
    if (!irref_isk(op2)) op2 = subst[op2];
    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
        op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
      subst[ins] = (IRRef1)ins;  /* Shortcut. */
    } else {
      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
      subst[ins] = (IRRef1)ref;
      if (ref != ins && ref < invar) {  /* Loop-carried dependency? */
        IRIns *irr = IR(ref);
        /* Potential PHI? */
        if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
          irt_setphi(irr->t);
          if (nphi >= LJ_MAX_PHI)
            lj_trace_err(J, LJ_TRERR_PHIOV);
          phi[nphi++] = (IRRef1)ref;
        }
        /* Check all loop-carried dependencies for type instability. */
        if (!irt_sametype(t, irr->t)) {
          if (irt_isinteger(t) && irt_isinteger(irr->t))
            continue;
          else if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num. */
            ref = tref_ref(emitir(IRTN(IR_CONV), ref, IRCONV_NUM_INT));
          else if (irt_isnum(irr->t) && irt_isinteger(t))  /* Fix num->int. */
            ref = tref_ref(emitir(IRTGI(IR_CONV), ref,
                                  IRCONV_INT_NUM|IRCONV_CHECK));
          else
            lj_trace_err(J, LJ_TRERR_TYPEINS);
          subst[ins] = (IRRef1)ref;
          /* May need a PHI for the CONV, too. */
          irr = IR(ref);
          if (ref < invar && !irref_isk(ref) && !irt_isphi(irr->t)) {
            irt_setphi(irr->t);
            if (nphi >= LJ_MAX_PHI)
              lj_trace_err(J, LJ_TRERR_PHIOV);
            phi[nphi++] = (IRRef1)ref;
          }
        }
      }
    }
  }
  if (!irt_isguard(J->guardemit))  /* Drop redundant snapshot. */
    J->cur.nsnapmap = (uint16_t)J->cur.snap[--J->cur.nsnap].mapofs;
  lua_assert(J->cur.nsnapmap <= J->sizesnapmap);
  *psentinel = J->cur.snapmap[J->cur.snap[0].nent];  /* Restore PC. */

  loop_emit_phi(J, subst, phi, nphi, onsnap);
}
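
The type-instability handling near the end of the loop above amounts to a small decision table: matching types need no fix, a loop-carried value that is an integer in the body copy but a number in the pre-roll is widened with a plain CONV, the opposite direction gets a checked number-to-integer conversion, and any other mismatch aborts the unroll with LJ_TRERR_TYPEINS. A hedged restatement with invented Ty/Fix enums:

/* Illustrative sketch only -- not part of lj_opt_loop.c. */
#include <stdio.h>

typedef enum { TY_INT, TY_NUM, TY_STR } Ty;
typedef enum { FIX_NONE, FIX_INT_TO_NUM, FIX_NUM_TO_INT_CHECKED, FIX_ABORT } Fix;

/* pre = type in the pre-roll, body = type of the loop-carried copy. */
static Fix stabilize(Ty pre, Ty body)
{
  if (pre == body) return FIX_NONE;
  if (pre == TY_NUM && body == TY_INT) return FIX_INT_TO_NUM;          /* Widen. */
  if (pre == TY_INT && body == TY_NUM) return FIX_NUM_TO_INT_CHECKED;  /* Guarded narrow. */
  return FIX_ABORT;                    /* E.g. number vs. string: give up. */
}

int main(void)
{
  printf("%d %d %d\n",
         stabilize(TY_NUM, TY_INT),   /* FIX_INT_TO_NUM */
         stabilize(TY_INT, TY_NUM),   /* FIX_NUM_TO_INT_CHECKED */
         stabilize(TY_NUM, TY_STR));  /* FIX_ABORT */
  return 0;
}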

/* Undo any partial changes made by the loop optimization. */
static void loop_undo(jit_State *J, IRRef ins, SnapNo nsnap, MSize nsnapmap)
{
  ptrdiff_t i;
  SnapShot *snap = &J->cur.snap[nsnap-1];
  SnapEntry *map = J->cur.snapmap;
  map[snap->mapofs + snap->nent] = map[J->cur.snap[0].nent];  /* Restore PC. */
  J->cur.nsnapmap = (uint16_t)nsnapmap;
  J->cur.nsnap = nsnap;
  J->guardemit.irt = 0;
  lj_ir_rollback(J, ins);
  for (i = 0; i < BPROP_SLOTS; i++) {  /* Remove backprop. cache entries. */
    BPropEntry *bp = &J->bpropcache[i];
    if (bp->val >= ins)
      bp->key = 0;
  }
  for (ins--; ins >= REF_FIRST; ins--) {  /* Remove flags. */
    IRIns *ir = IR(ins);
    irt_clearphi(ir->t);
    irt_clearmark(ir->t);
  }
}

/* Protected callback for loop optimization. */
static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
{
  UNUSED(L); UNUSED(dummy);
  loop_unroll((jit_State *)ud);
  return NULL;
}

/* Loop optimization. */
int lj_opt_loop(jit_State *J)
{
  IRRef nins = J->cur.nins;
  SnapNo nsnap = J->cur.nsnap;
  MSize nsnapmap = J->cur.nsnapmap;
  int errcode = lj_vm_cpcall(J->L, NULL, J, cploop_opt);
  if (LJ_UNLIKELY(errcode)) {
    lua_State *L = J->L;
    if (errcode == LUA_ERRRUN && tvisnumber(L->top-1)) {  /* Trace error? */
      int32_t e = numberVint(L->top-1);
      switch ((TraceError)e) {
      case LJ_TRERR_TYPEINS:  /* Type instability. */
      case LJ_TRERR_GFAIL:  /* Guard would always fail. */
        /* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
        if (--J->instunroll < 0)  /* But do not unroll forever. */
          break;
        L->top--;  /* Remove error object. */
        loop_undo(J, nins, nsnap, nsnapmap);
        return 1;  /* Loop optimization failed, continue recording. */
      default:
        break;
      }
    }
    lj_err_throw(L, errcode);  /* Propagate all other errors. */
  }
  return 0;  /* Loop optimization is ok. */
}
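
lj_opt_loop() runs the unroll under a protected call so that a recoverable trace error (type instability or a guard that would always fail) can undo all partial changes via loop_undo() and let recording continue, while any other error is rethrown. A generic sketch of the same attempt-then-undo pattern, using setjmp/longjmp in place of lj_vm_cpcall (risky_optimization, attempt and the ERR_* values are invented):

/* Illustrative sketch only -- not part of lj_opt_loop.c. */
#include <setjmp.h>
#include <stdio.h>

static jmp_buf cp;
enum { ERR_NONE, ERR_RECOVERABLE, ERR_FATAL };

static void risky_optimization(int fail)
{
  if (fail) longjmp(cp, fail);    /* "Throw" a trace error. */
  /* ... mutate the trace here ... */
}

/* Returns 1 if the optimization had to be undone, 0 if it succeeded. */
static int attempt(int fail)
{
  int saved_state = 42;           /* Snapshot of whatever the pass mutates. */
  int err = setjmp(cp);
  if (err == ERR_NONE) {
    risky_optimization(fail);
    return 0;                     /* Optimization is ok. */
  } else if (err == ERR_RECOVERABLE) {
    (void)saved_state;            /* Roll back to the snapshot (cf. loop_undo). */
    return 1;                     /* Caller keeps recording without the loop. */
  }
  fprintf(stderr, "fatal optimization error\n");
  return -1;                      /* Anything else would be rethrown. */
}

int main(void)
{
  printf("%d %d\n", attempt(ERR_NONE), attempt(ERR_RECOVERABLE));
  return 0;
}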

#undef IR
#undef emitir
#undef emitir_raw

#endif