Files
lua/src/llex.c

597 lines
17 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
** $Id: llex.c $
** Lexical Analyzer
** See Copyright Notice in lua.h
*/
#define llex_c
#define LUA_CORE
#include "lprefix.h"
#include <locale.h>
#include <string.h>
#include "lua.h"
#include "lctype.h"
#include "ldebug.h"
#include "ldo.h"
#include "lgc.h"
#include "llex.h"
#include "lobject.h"
#include "lparser.h"
#include "lstate.h"
#include "lstring.h"
#include "ltable.h"
#include "lzio.h"
// 获取下一个字符
#define next(ls) (ls->current = zgetc(ls->z))
// 当前字符是新行吗?
#define currIsNewline(ls) (ls->current == '\n' || ls->current == '\r')
/* ORDER RESERVED */
static const char *const luaX_tokens [] = {
"and", "break", "do", "else", "elseif",
"end", "false", "for", "function", "goto", "if",
"in", "local", "nil", "not", "or", "repeat",
"return", "then", "true", "until", "while",
"//", "..", "...", "==", ">=", "<=", "~=",
"<<", ">>", "::", "<eof>",
"<number>", "<integer>", "<name>", "<string>"
};
// 把当前字符保存到缓冲区,并且读取下一个字符
#define save_and_next(ls) (save(ls, ls->current), next(ls))
// 报错
static l_noret lexerror (LexState *ls, const char *msg, int token);
// 把当前字符保存到缓冲区
// 如果超出缓冲区大小,则更新缓冲区大小
static void save (LexState *ls, int c) {
Mbuffer *b = ls->buff;
if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
size_t newsize;
if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
lexerror(ls, "lexical element too long", 0);
newsize = luaZ_sizebuffer(b) * 2;
// 这里会重新申请内存
luaZ_resizebuffer(ls->L, b, newsize);
}
b->buffer[luaZ_bufflen(b)++] = cast_char(c);
}
void luaX_init (lua_State *L) {
int i;
// 创建一个新的字符串
TString *e = luaS_newliteral(L, LUA_ENV); /* create env name */
luaC_fix(L, obj2gco(e)); /* never collect this name */
for (i=0; i<NUM_RESERVED; i++) {
// 创建保留字字符串
TString *ts = luaS_new(L, luaX_tokens[i]);
// 永远不要回收保留字字符串
luaC_fix(L, obj2gco(ts)); /* reserved words are never collected */
// 字符串对应的符号码ts->extra+255== RESERVED
ts->extra = cast_byte(i+1); /* reserved word */
}
}
const char *luaX_token2str (LexState *ls, int token) {
if (token < FIRST_RESERVED) { /* single-byte symbols? */
if (lisprint(token))
return luaO_pushfstring(ls->L, "'%c'", token);
else /* control character */
return luaO_pushfstring(ls->L, "'<\\%d>'", token);
}
else {
const char *s = luaX_tokens[token - FIRST_RESERVED];
if (token < TK_EOS) /* fixed format (symbols and reserved words)? */
return luaO_pushfstring(ls->L, "'%s'", s);
else /* names, strings, and numerals */
return s;
}
}
static const char *txtToken (LexState *ls, int token) {
switch (token) {
case TK_NAME: case TK_STRING:
case TK_FLT: case TK_INT:
save(ls, '\0');
return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
default:
return luaX_token2str(ls, token);
}
}
static l_noret lexerror (LexState *ls, const char *msg, int token) {
msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
if (token)
luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
luaD_throw(ls->L, LUA_ERRSYNTAX);
}
l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
lexerror(ls, msg, ls->t.token);
}
/*
** Creates a new string and anchors it in scanner's table so that it
** will not be collected until the end of the compilation; by that time
** it should be anchored somewhere. It also internalizes long strings,
** ensuring there is only one copy of each unique string. The table
** here is used as a set: the string enters as the key, while its value
** is irrelevant. We use the string itself as the value only because it
** is a TValue readily available. Later, the code generation can change
** this value.
*/
TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
lua_State *L = ls->L;
TString *ts = luaS_newlstr(L, str, l); /* create new string */
const TValue *o = luaH_getstr(ls->h, ts);
if (!ttisnil(o)) /* string already present? */
ts = keystrval(nodefromval(o)); /* get saved copy */
else { /* not in use yet */
TValue *stv = s2v(L->top.p++); /* reserve stack space for string */
setsvalue(L, stv, ts); /* temporarily anchor the string */
luaH_finishset(L, ls->h, stv, o, stv); /* t[string] = string */
/* table is not a metatable, so it does not need to invalidate cache */
luaC_checkGC(L);
L->top.p--; /* remove string from stack */
}
return ts;
}
/*
** increment line number and skips newline sequence (any of
** \n, \r, \n\r, or \r\n)
*/
static void inclinenumber (LexState *ls) {
int old = ls->current;
lua_assert(currIsNewline(ls));
next(ls); /* skip '\n' or '\r' */
if (currIsNewline(ls) && ls->current != old)
next(ls); /* skip '\n\r' or '\r\n' */
if (++ls->linenumber >= MAX_INT)
lexerror(ls, "chunk has too many lines", 0);
}
void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
int firstchar) {
ls->t.token = 0;
ls->L = L;
ls->current = firstchar;
ls->lookahead.token = TK_EOS; /* no look-ahead token */
ls->z = z;
ls->fs = NULL;
ls->linenumber = 1;
ls->lastline = 1;
ls->source = source;
ls->envn = luaS_newliteral(L, LUA_ENV); /* get env name */
luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER); /* initialize buffer */
}
/*
** =======================================================
** LEXICAL ANALYZER
** =======================================================
*/
static int check_next1 (LexState *ls, int c) {
if (ls->current == c) {
next(ls);
return 1;
}
else return 0;
}
/*
** Check whether current char is in set 'set' (with two chars) and
** saves it
*/
static int check_next2 (LexState *ls, const char *set) {
lua_assert(set[2] == '\0');
if (ls->current == set[0] || ls->current == set[1]) {
save_and_next(ls);
return 1;
}
else return 0;
}
/* LUA_NUMBER */
/*
** This function is quite liberal in what it accepts, as 'luaO_str2num'
** will reject ill-formed numerals. Roughly, it accepts the following
** pattern:
**
** %d(%x|%.|([Ee][+-]?))* | 0[Xx](%x|%.|([Pp][+-]?))*
**
** The only tricky part is to accept [+-] only after a valid exponent
** mark, to avoid reading '3-4' or '0xe+1' as a single number.
**
** The caller might have already read an initial dot.
*/
static int read_numeral (LexState *ls, SemInfo *seminfo) {
TValue obj;
const char *expo = "Ee";
int first = ls->current;
lua_assert(lisdigit(ls->current));
save_and_next(ls);
if (first == '0' && check_next2(ls, "xX")) /* hexadecimal? */
expo = "Pp";
for (;;) {
if (check_next2(ls, expo)) /* exponent mark? */
check_next2(ls, "-+"); /* optional exponent sign */
else if (lisxdigit(ls->current) || ls->current == '.') /* '%x|%.' */
save_and_next(ls);
else break;
}
if (lislalpha(ls->current)) /* is numeral touching a letter? */
save_and_next(ls); /* force an error */
save(ls, '\0');
if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0) /* format error? */
lexerror(ls, "malformed number", TK_FLT);
if (ttisinteger(&obj)) {
seminfo->i = ivalue(&obj);
return TK_INT;
}
else {
lua_assert(ttisfloat(&obj));
seminfo->r = fltvalue(&obj);
return TK_FLT;
}
}
/*
** read a sequence '[=*[' or ']=*]', leaving the last bracket. If
** sequence is well formed, return its number of '='s + 2; otherwise,
** return 1 if it is a single bracket (no '='s and no 2nd bracket);
** otherwise (an unfinished '[==...') return 0.
*/
static size_t skip_sep (LexState *ls) {
size_t count = 0;
int s = ls->current;
lua_assert(s == '[' || s == ']');
save_and_next(ls);
while (ls->current == '=') {
save_and_next(ls);
count++;
}
return (ls->current == s) ? count + 2
: (count == 0) ? 1
: 0;
}
static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep) {
int line = ls->linenumber; /* initial line (for error message) */
save_and_next(ls); /* skip 2nd '[' */
if (currIsNewline(ls)) /* string starts with a newline? */
inclinenumber(ls); /* skip it */
for (;;) {
switch (ls->current) {
case EOZ: { /* error */
const char *what = (seminfo ? "string" : "comment");
const char *msg = luaO_pushfstring(ls->L,
"unfinished long %s (starting at line %d)", what, line);
lexerror(ls, msg, TK_EOS);
break; /* to avoid warnings */
}
case ']': {
if (skip_sep(ls) == sep) {
save_and_next(ls); /* skip 2nd ']' */
goto endloop;
}
break;
}
case '\n': case '\r': {
save(ls, '\n');
inclinenumber(ls);
if (!seminfo) luaZ_resetbuffer(ls->buff); /* avoid wasting space */
break;
}
default: {
if (seminfo) save_and_next(ls);
else next(ls);
}
}
} endloop:
if (seminfo)
seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
luaZ_bufflen(ls->buff) - 2 * sep);
}
static void esccheck (LexState *ls, int c, const char *msg) {
if (!c) {
if (ls->current != EOZ)
save_and_next(ls); /* add current to buffer for error message */
lexerror(ls, msg, TK_STRING);
}
}
static int gethexa (LexState *ls) {
save_and_next(ls);
esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
return luaO_hexavalue(ls->current);
}
static int readhexaesc (LexState *ls) {
int r = gethexa(ls);
r = (r << 4) + gethexa(ls);
luaZ_buffremove(ls->buff, 2); /* remove saved chars from buffer */
return r;
}
static unsigned long readutf8esc (LexState *ls) {
unsigned long r;
int i = 4; /* chars to be removed: '\', 'u', '{', and first digit */
save_and_next(ls); /* skip 'u' */
esccheck(ls, ls->current == '{', "missing '{'");
r = gethexa(ls); /* must have at least one digit */
while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) {
i++;
esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large");
r = (r << 4) + luaO_hexavalue(ls->current);
}
esccheck(ls, ls->current == '}', "missing '}'");
next(ls); /* skip '}' */
luaZ_buffremove(ls->buff, i); /* remove saved chars from buffer */
return r;
}
static void utf8esc (LexState *ls) {
char buff[UTF8BUFFSZ];
int n = luaO_utf8esc(buff, readutf8esc(ls));
for (; n > 0; n--) /* add 'buff' to string */
save(ls, buff[UTF8BUFFSZ - n]);
}
static int readdecesc (LexState *ls) {
int i;
int r = 0; /* result accumulator */
for (i = 0; i < 3 && lisdigit(ls->current); i++) { /* read up to 3 digits */
r = 10*r + ls->current - '0';
save_and_next(ls);
}
esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
luaZ_buffremove(ls->buff, i); /* remove read digits from buffer */
return r;
}
static void read_string (LexState *ls, int del, SemInfo *seminfo) {
save_and_next(ls); /* keep delimiter (for error messages) */
while (ls->current != del) {
switch (ls->current) {
case EOZ:
lexerror(ls, "unfinished string", TK_EOS);
break; /* to avoid warnings */
case '\n':
case '\r':
lexerror(ls, "unfinished string", TK_STRING);
break; /* to avoid warnings */
case '\\': { /* escape sequences */
int c; /* final character to be saved */
save_and_next(ls); /* keep '\\' for error messages */
switch (ls->current) {
case 'a': c = '\a'; goto read_save;
case 'b': c = '\b'; goto read_save;
case 'f': c = '\f'; goto read_save;
case 'n': c = '\n'; goto read_save;
case 'r': c = '\r'; goto read_save;
case 't': c = '\t'; goto read_save;
case 'v': c = '\v'; goto read_save;
case 'x': c = readhexaesc(ls); goto read_save;
case 'u': utf8esc(ls); goto no_save;
case '\n': case '\r':
inclinenumber(ls); c = '\n'; goto only_save;
case '\\': case '\"': case '\'':
c = ls->current; goto read_save;
case EOZ: goto no_save; /* will raise an error next loop */
case 'z': { /* zap following span of spaces */
luaZ_buffremove(ls->buff, 1); /* remove '\\' */
next(ls); /* skip the 'z' */
while (lisspace(ls->current)) {
if (currIsNewline(ls)) inclinenumber(ls);
else next(ls);
}
goto no_save;
}
default: {
esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
c = readdecesc(ls); /* digital escape '\ddd' */
goto only_save;
}
}
read_save:
next(ls);
/* go through */
only_save:
luaZ_buffremove(ls->buff, 1); /* remove '\\' */
save(ls, c);
/* go through */
no_save: break;
}
default:
save_and_next(ls);
}
}
save_and_next(ls); /* skip delimiter */
seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
luaZ_bufflen(ls->buff) - 2);
}
// ls分词器状态管理; seminfo语义信息
static int llex (LexState *ls, SemInfo *seminfo) {
// buff中缓存清零
luaZ_resetbuffer(ls->buff);
for (;;) {
switch (ls->current) {
case '\n': case '\r': { /* line breaks */
// 跳过换行符
inclinenumber(ls);
break;
}
case ' ': case '\f': case '\t': case '\v': { /* spaces */
// 跳过空白字符
next(ls);
break;
}
case '-': { /* '-' or '--' (comment) */
next(ls);
// 返回 - 号
if (ls->current != '-') return '-';
/* else is a comment */
next(ls);
if (ls->current == '[') { /* long comment? */
// 跳过长注释
size_t sep = skip_sep(ls);
luaZ_resetbuffer(ls->buff); /* 'skip_sep' may dirty the buffer */
if (sep >= 2) {
read_long_string(ls, NULL, sep); /* skip long comment */
luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */
break;
}
}
// 跳过短注释
/* else short comment */
while (!currIsNewline(ls) && ls->current != EOZ)
next(ls); /* skip until end of line (or end of file) */
break;
}
case '[': { /* long string or simply '[' */
size_t sep = skip_sep(ls);
if (sep >= 2) {
read_long_string(ls, seminfo, sep);
return TK_STRING;
}
else if (sep == 0) /* '[=...' missing second bracket? */
lexerror(ls, "invalid long string delimiter", TK_STRING);
return '[';
}
case '=': {
next(ls);
if (check_next1(ls, '=')) return TK_EQ; /* '==' */
else return '=';
}
case '<': {
next(ls);
if (check_next1(ls, '=')) return TK_LE; /* '<=' */
else if (check_next1(ls, '<')) return TK_SHL; /* '<<' */
else return '<';
}
case '>': {
next(ls);
if (check_next1(ls, '=')) return TK_GE; /* '>=' */
else if (check_next1(ls, '>')) return TK_SHR; /* '>>' */
else return '>';
}
case '/': {
next(ls);
if (check_next1(ls, '/')) return TK_IDIV; /* '//' */
else return '/';
}
case '~': {
next(ls);
if (check_next1(ls, '=')) return TK_NE; /* '~=' */
else return '~';
}
case ':': {
next(ls);
if (check_next1(ls, ':')) return TK_DBCOLON; /* '::' */
else return ':';
}
case '"': case '\'': { /* short literal strings */
read_string(ls, ls->current, seminfo);
return TK_STRING;
}
case '.': { /* '.', '..', '...', or number */
save_and_next(ls);
if (check_next1(ls, '.')) {
if (check_next1(ls, '.'))
return TK_DOTS; /* '...' */
else return TK_CONCAT; /* '..' */
}
else if (!lisdigit(ls->current)) return '.';
else return read_numeral(ls, seminfo);
}
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9': {
return read_numeral(ls, seminfo);
}
case EOZ: {
return TK_EOS;
}
default: {
if (lislalpha(ls->current)) { /* identifier or reserved word? */
TString *ts;
do {
save_and_next(ls);
} while (lislalnum(ls->current));
ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
luaZ_bufflen(ls->buff));
seminfo->ts = ts;
if (isreserved(ts)) /* reserved word? */
return ts->extra - 1 + FIRST_RESERVED;
else {
return TK_NAME;
}
}
else { /* single-char tokens ('+', '*', '%', '{', '}', ...) */
int c = ls->current;
next(ls);
return c;
}
}
}
}
}
void luaX_next (LexState *ls) {
ls->lastline = ls->linenumber;
if (ls->lookahead.token != TK_EOS) { /* is there a look-ahead token? */
ls->t = ls->lookahead; /* use this one */
ls->lookahead.token = TK_EOS; /* and discharge it */
}
else
ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */
}
int luaX_lookahead (LexState *ls) {
lua_assert(ls->lookahead.token == TK_EOS);
ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
return ls->lookahead.token;
}