1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
|
--[[--------------------------------------------------------------------
llex.lua
Lua 5.1 lexical analyzer in Lua
This file is part of Yueliang.
Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net>
The COPYRIGHT file describes the conditions
under which this software may be distributed.
See the ChangeLog for more information.
----------------------------------------------------------------------]]
--[[--------------------------------------------------------------------
-- Notes:
-- * takes in the entire source at once
-- * greatly simplified chunkid, error handling
-- * NO shbang handling (it's done elsewhere in Lua 5.1)
-- * NO localized decimal point replacement magic
-- * NO limit to number of lines (MAX_INT = 2147483645)
-- * NO support for compatible long strings (LUA_COMPAT_LSTR)
-- * NO next(), lookahead() because I want next() to set tok and
-- seminfo that are locals, and that can only be done easily in
-- lparser, not llex. lastline would be handled in lparser too.
--
-- Usage example:
-- local llex = require("llex_mk2")
-- llex.init(source_code, source_code_name)
-- repeat
-- local token, seminfo = llex.llex()
-- until token == "<eof>"
--
----------------------------------------------------------------------]]
-- keep a reference to the real global table: module() below replaces
-- this chunk's environment, so true globals are reached via `base`
local base = _G
local string = require "string"
-- module() (Lua 5.1 style, deprecated in later versions) installs a
-- fresh module table as this chunk's environment: every "global"
-- assignment below (ln, init, chunkid, errorline, llex) becomes a
-- public field of the "llex" module
module "llex"
----------------------------------------------------------------------
-- initialize keyword list
-- * kw maps each Lua 5.1 reserved word to true for O(1) lookup
----------------------------------------------------------------------
local kw = {}
do
  local reserved = {
    "and", "break", "do", "else", "elseif", "end", "false", "for",
    "function", "if", "in", "local", "nil", "not", "or", "repeat",
    "return", "then", "true", "until", "while",
  }
  for i = 1, #reserved do
    kw[reserved[i]] = true
  end
end
----------------------------------------------------------------------
-- initialize lexer for given source _z and source name _sourceid
----------------------------------------------------------------------
-- shared lexer state: z is the entire source string, sourceid its
-- name, I the lexer's current 1-based position within z
local z, sourceid, I
-- localized string functions, used throughout the hot lexing paths
local find = string.find
local match = string.match
local sub = string.sub
function init(_z, _sourceid)
z = _z -- source
sourceid = _sourceid -- name of source
I = 1 -- lexer's position in source
-- ln is deliberately NOT local: under module "llex" it becomes a
-- module field, so client code (the parser) can read the line number
ln = 1 -- line number
end
----------------------------------------------------------------------
-- returns a chunk name or id, no truncation for long names
-- * a leading '=' or '@' marker on the source id is stripped off;
--   any other (or missing) id is reported as a plain string chunk
----------------------------------------------------------------------
function chunkid()
  local id = sourceid
  if id and find(id, "^[=@]") then
    return sub(id, 2)               -- drop the '=' / '@' marker
  end
  return "[string]"
end
----------------------------------------------------------------------
-- formats error message and throws error
-- * a simplified version, does not report what token was responsible
-- * line defaults to the lexer's current line number ln
----------------------------------------------------------------------
function errorline(s, line)
  local where = line or ln
  base.error(string.format("%s:%d: %s", chunkid(), where, s))
end
----------------------------------------------------------------------
-- handles line number incrementation and end-of-line characters
-- * i must point at a '\n' or '\r'; a following complementary char
--   ("\r\n" or "\n\r") is consumed as part of the same line break
-- * bumps ln, syncs I, and returns the position after the newline
----------------------------------------------------------------------
local function inclinenumber(i)
  local first = sub(z, i, i)        -- the newline char we are sitting on
  i = i + 1
  local second = sub(z, i, i)
  if (second == "\n" or second == "\r") and second ~= first then
    i = i + 1                       -- two-char EOL counts as one newline
  end
  ln = ln + 1
  I = i
  return i
end
------------------------------------------------------------------------
-- count separators ("=") in a long string delimiter
-- * i points at the opening '[' or closing ']'; I is advanced past the
--   run of '='s
-- * returns count >= 0 when the '=' run is followed by the same bracket
--   character (a well-formed delimiter); otherwise -(count + 1)
------------------------------------------------------------------------
local function skip_sep(i)
  local delim = sub(z, i, i)        -- '[' or ']'
  i = i + 1
  local eqs = match(z, "=*", i)     -- possibly empty run of '='
  i = i + #eqs
  I = i
  if sub(z, i, i) == delim then
    return #eqs                     -- e.g. [==[ or ]==]
  end
  return -#eqs - 1                  -- malformed: encoded as negative
end
----------------------------------------------------------------------
-- reads a long string or long comment
-- * is_str: true for a long string (contents collected and returned),
--   false for a long comment (contents still scanned, result unused)
-- * sep: number of '=' in the delimiter, as returned by skip_sep()
-- * on entry I points at the second '[' of the opening delimiter
-- * fix: removed the unused local `j` present in the original
----------------------------------------------------------------------
local function read_long_string(is_str, sep)
  local i = I + 1                 -- skip 2nd '['
  local sub = sub
  local buff = ""
  local c = sub(z, i, i)
  if c == "\r" or c == "\n" then  -- string starts with a newline?
    i = inclinenumber(i)          -- skip it
  end
  while true do
    -- jump to the next newline or ']'; anything else is plain content
    local p, q, r = find(z, "([\r\n%]])", i) -- (long range)
    if not p then
      errorline(is_str and "unfinished long string" or
                "unfinished long comment")
    end
    if is_str then
      buff = buff..sub(z, i, p - 1) -- save string portions
    end
    i = p
    if r == "]" then              -- delimiter test
      if skip_sep(i) == sep then  -- matching closer, e.g. ]==]
        i = I + 1                 -- skip 2nd ']'
        break
      end
      -- not the closing delimiter: keep the text skip_sep() scanned
      buff = buff..sub(z, i, I - 1)
      i = I
    else                          -- newline: normalized to '\n'
      buff = buff.."\n"
      i = inclinenumber(i)
    end
  end--while
  I = i
  return buff
end
----------------------------------------------------------------------
-- reads a string
-- * del is the quote character that opened the string (single or
--   double); returns the contents with escape sequences decoded
-- * on entry I points just past the opening quote
----------------------------------------------------------------------
local function read_string(del)
local i = I
local find = find
local sub = sub
local buff = ""
while true do
-- scan ahead to the next "interesting" character: a newline
-- (error), a backslash (escape) or either quote character
local p, q, r = find(z, "([\n\r\\\"\'])", i) -- (long range)
if p then
if r == "\n" or r == "\r" then
errorline("unfinished string")
end
buff = buff..sub(z, i, p - 1) -- normal portions
i = p
if r == "\\" then -- handle escapes
i = i + 1
r = sub(z, i, i)
if r == "" then break end -- (EOZ error)
-- position in this list selects the replacement char; positions
-- 8/9 are literal newlines ("\<newline>" inside a string)
p = find("abfnrtv\n\r", r, 1, true)
------------------------------------------------------
if p then -- special escapes
if p > 7 then
r = "\n" -- escaped real newline becomes '\n'
i = inclinenumber(i)
else
r = sub("\a\b\f\n\r\t\v", p, p)
i = i + 1
end
------------------------------------------------------
elseif find(r, "%D") then -- other non-digits (e.g. \\ \" \') stand for themselves
i = i + 1
------------------------------------------------------
else -- \xxx sequence: up to three decimal digits
local p, q, s = find(z, "^(%d%d?%d?)", i)
i = q + 1
if s + 1 > 256 then -- UCHAR_MAX (i.e. value must be <= 255)
errorline("escape sequence too large")
end
r = string.char(s)
------------------------------------------------------
end--if p
else
i = i + 1
if r == del then -- ending delimiter
I = i; return buff -- return string
end
-- a quote of the *other* kind is an ordinary character and
-- falls through to be appended below
end--if r
buff = buff..r -- handled escapes falls through to here
else
break -- (error: no closing quote before end of source)
end--if p
end--while
errorline("unfinished string")
end
------------------------------------------------------------------------
-- main lexer function
-- * returns the next token as a string, plus a semantic value for
--   "<name>", "<number>" and "<string>" tokens; "<eof>" at end
-- * reserved words are returned as the word itself; punctuation and
--   operators as their symbol string (e.g. "==", "..", "[")
------------------------------------------------------------------------
function llex()
local find = find
local match = match
while true do--outer
local i = I
-- inner loop allows break to be used to nicely section tests
while true do--inner
----------------------------------------------------------------
-- identifier or reserved word: [_%a][_%w]*
local p, _, r = find(z, "^([_%a][_%w]*)", i)
if p then
I = i + #r
if kw[r] then return r end -- reserved word (keyword)
return "<name>", r -- identifier
end
----------------------------------------------------------------
-- numeral: scans digits with optional fraction, exponent and sign;
-- base.tonumber() does the actual conversion and rejects bad forms
local p, _, r = find(z, "^(%.?)%d", i)
if p then -- numeral
if r == "." then i = i + 1 end
local _, q, r = find(z, "^%d*[%.%d]*([eE]?)", i)
i = q + 1
if #r == 1 then -- optional exponent
if match(z, "^[%+%-]", i) then -- optional sign
i = i + 1
end
end
-- swallow any trailing alphanumerics so e.g. "1abc" is reported
-- as one malformed number rather than a number plus a name
local _, q = find(z, "^[_%w]*", i)
I = q + 1
local v = base.tonumber(sub(z, p, q)) -- handles hex also
if not v then errorline("malformed number") end
return "<number>", v
end
----------------------------------------------------------------
-- whitespace: a newline bumps the line counter; other blanks are
-- consumed in one run of spaces/tabs
local p, q, r = find(z, "^(%s)[ \t]*", i)
if p then
if r == "\n" or r == "\r" then -- newline
inclinenumber(i)
else
I = q + 1 -- whitespace
end
break -- (continue)
end
----------------------------------------------------------------
-- punctuation: the plain find() below maps r to a position p that
-- selects a handler: 1='-', 2='[', 3/4=quotes, 5='.', 6..9='=<>~'
local r = match(z, "^%p", i)
if r then
local p = find("-[\"\'.=<>~", r, 1, true)
if p then
-- two-level if block for punctuation/symbols
--------------------------------------------------------
if p <= 2 then
if p == 1 then -- minus
-- "--" starts a comment; "--[" may start a long comment
local c = match(z, "^%-%-(%[?)", i)
if c then
i = i + 2
local sep = -1
if c == "[" then
sep = skip_sep(i)
end
if sep >= 0 then -- long comment
read_long_string(false, sep)
else -- short comment: skip to end of line (or end of source)
I = find(z, "[\n\r]", i) or (#z + 1)
end
break -- (continue)
end
-- (fall through for "-")
else -- [ or long string
local sep = skip_sep(i)
if sep >= 0 then
return "<string>", read_long_string(true, sep)
elseif sep == -1 then
return "["
else
errorline("invalid long string delimiter")
end
end
--------------------------------------------------------
elseif p <= 5 then
if p < 5 then -- strings
I = i + 1
return "<string>", read_string(r)
end
r = match(z, "^%.%.?%.?", i) -- .|..|... dots
-- (fall through)
--------------------------------------------------------
else -- relational: '=' '<' '>' '~' optionally followed by '='
r = match(z, "^%p=?", i)
-- (fall through)
end
end
I = i + #r; return r -- for other symbols, fall through
end
----------------------------------------------------------------
-- any other single character is returned as-is; "" means EOF
local r = sub(z, i, i)
if r ~= "" then
I = i + 1; return r -- other single-char tokens
end
return "<eof>" -- end of stream
----------------------------------------------------------------
end--while inner
end--while outer
end
-- return the module table (this chunk's environment, as installed by
-- module "llex" above)
return base.getfenv()
|