lex.c 11.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/*
 * Copyright 2011 Jacek Caban for CodeWeavers
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
 */

#include <assert.h>
20
#include <limits.h>
21
#include <math.h>
22 23 24 25 26 27 28 29 30

#include "vbscript.h"
#include "parse.h"
#include "parser.tab.h"

#include "wine/debug.h"

WINE_DEFAULT_DEBUG_CHANNEL(vbscript);

31 32 33 34
static const struct {
    const WCHAR *word;
    int token;
} keywords[] = {
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
    {L"and",       tAND},
    {L"byref",     tBYREF},
    {L"byval",     tBYVAL},
    {L"call",      tCALL},
    {L"case",      tCASE},
    {L"class",     tCLASS},
    {L"const",     tCONST},
    {L"default",   tDEFAULT},
    {L"dim",       tDIM},
    {L"do",        tDO},
    {L"each",      tEACH},
    {L"else",      tELSE},
    {L"elseif",    tELSEIF},
    {L"empty",     tEMPTY},
    {L"end",       tEND},
    {L"eqv",       tEQV},
    {L"error",     tERROR},
    {L"exit",      tEXIT},
    {L"explicit",  tEXPLICIT},
    {L"false",     tFALSE},
    {L"for",       tFOR},
    {L"function",  tFUNCTION},
    {L"get",       tGET},
    {L"goto",      tGOTO},
    {L"if",        tIF},
    {L"imp",       tIMP},
    {L"in",        tIN},
    {L"is",        tIS},
    {L"let",       tLET},
    {L"loop",      tLOOP},
    {L"me",        tME},
    {L"mod",       tMOD},
    {L"new",       tNEW},
    {L"next",      tNEXT},
    {L"not",       tNOT},
    {L"nothing",   tNOTHING},
    {L"null",      tNULL},
    {L"on",        tON},
    {L"option",    tOPTION},
    {L"or",        tOR},
75
    {L"preserve",  tPRESERVE},
76 77 78
    {L"private",   tPRIVATE},
    {L"property",  tPROPERTY},
    {L"public",    tPUBLIC},
79
    {L"redim",     tREDIM},
80 81 82 83 84 85 86 87 88 89 90 91 92
    {L"rem",       tREM},
    {L"resume",    tRESUME},
    {L"select",    tSELECT},
    {L"set",       tSET},
    {L"step",      tSTEP},
    {L"stop",      tSTOP},
    {L"sub",       tSUB},
    {L"then",      tTHEN},
    {L"to",        tTO},
    {L"true",      tTRUE},
    {L"until",     tUNTIL},
    {L"wend",      tWEND},
    {L"while",     tWHILE},
93
    {L"with",      tWITH},
94
    {L"xor",       tXOR}
95 96
};

97 98
/* Returns non-zero when c is an underscore or an alphanumeric character (per iswalnum). */
static inline BOOL is_identifier_char(WCHAR c)
{
    if(c == '_')
        return 1;
    return iswalnum(c) != 0;
}

102
/*
 * Compares the input at ctx->ptr, case-insensitively, with the lower-case
 * keyword 'word'.
 *
 * Returns 0 when the input holds exactly that keyword (i.e. it is not followed
 * by another identifier character); in that case ctx->ptr is advanced past the
 * keyword and *lval is set to 'word'.  Otherwise returns a negative value when
 * the input sorts before the keyword and a positive value when it sorts after
 * it, leaving ctx->ptr and *lval untouched.  The sign is what check_keywords()
 * uses to steer its binary search.
 */
static int check_keyword(parser_ctx_t *ctx, const WCHAR *word, const WCHAR **lval)
{
    const WCHAR *p1 = ctx->ptr;
    const WCHAR *p2 = word;
    WCHAR c;

    while(p1 < ctx->end && *p2) {
        c = towlower(*p1);
        if(c != *p2)
            return c - *p2;
        p1++;
        p2++;
    }

    /*
     * The input ran out in the middle of the keyword, so it is a proper prefix
     * of it and sorts before it.  Reporting "after" here would make the binary
     * search skip shorter keywords such as "else" when probing "elseif".
     */
    if(*p2)
        return -1;

    /* The keyword matched but the word continues: the input sorts after it. */
    if(p1 < ctx->end && is_identifier_char(*p1))
        return 1;

    ctx->ptr = p1;
    *lval = word;
    return 0;
}

124
/*
 * Binary-searches the keywords table for the word at ctx->ptr.
 *
 * Returns the matching keyword's token (check_keyword() has then consumed the
 * word and stored it in *lval), or 0 when the input is not a keyword.
 */
static int check_keywords(parser_ctx_t *ctx, const WCHAR **lval)
{
    int lo = 0, hi = ARRAY_SIZE(keywords)-1;

    while(lo <= hi) {
        int mid = (lo+hi)/2;
        int cmp = check_keyword(ctx, keywords[mid].word, lval);

        if(cmp < 0)
            hi = mid-1;
        else if(cmp > 0)
            lo = mid+1;
        else
            return keywords[mid].token;
    }

    return 0;
}

144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
/*
 * Lexes an identifier starting at ctx->ptr.
 *
 * The first character is accepted unconditionally (the caller has already
 * classified it); the identifier then extends over every following
 * is_identifier_char() character.  A NUL-terminated copy obtained from
 * parser_alloc() is stored in *ret.
 *
 * Returns tIdentifier, or 0 when the allocation fails.
 */
static int parse_identifier(parser_ctx_t *ctx, const WCHAR **ret)
{
    const WCHAR *ptr = ctx->ptr++;
    WCHAR *str;
    int len;

    while(ctx->ptr < ctx->end && is_identifier_char(*ctx->ptr))
        ctx->ptr++;
    len = ctx->ptr-ptr;

    str = parser_alloc(ctx, (len+1)*sizeof(WCHAR));
    if(!str)
        return 0;

    /* Copy only the identifier itself; the terminator is written explicitly,
     * so there is no reason to read the character that follows it. */
    memcpy(str, ptr, len*sizeof(WCHAR));
    str[len] = 0;
    *ret = str;
    return tIdentifier;
}

164 165 166 167 168 169 170
/*
 * Lexes a double-quoted string literal; ctx->ptr points at the opening quote.
 *
 * Inside the literal a pair of quotes ("") stands for one quote character.
 * The unescaped, NUL-terminated contents are stored in *ret and ctx->ptr is
 * left just past the closing quote.
 *
 * Returns tString, or 0 on a newline inside the literal, a missing closing
 * quote, or allocation failure.
 */
static int parse_string_literal(parser_ctx_t *ctx, const WCHAR **ret)
{
    const WCHAR *src = ++ctx->ptr;
    WCHAR *dst;
    int escaped = 0, len;

    /* First pass: find the closing quote and count the doubled quotes. */
    for(; ctx->ptr < ctx->end; ctx->ptr++) {
        WCHAR ch = *ctx->ptr;

        if(ch == '\n' || ch == '\r') {
            FIXME("newline inside string literal\n");
            return 0;
        }

        if(ch != '"')
            continue;
        if(ctx->ptr[1] != '"')
            break;

        /* Doubled quote: skip its first half here, the loop skips the second. */
        escaped++;
        ctx->ptr++;
    }

    if(ctx->ptr == ctx->end) {
        FIXME("unterminated string literal\n");
        return 0;
    }

    /* Each doubled quote contributes one character to the result, not two. */
    len = (ctx->ptr-src) - escaped;

    dst = parser_alloc(ctx, (len+1)*sizeof(WCHAR));
    *ret = dst;
    if(!dst)
        return 0;

    /* Second pass: copy the contents, dropping the first quote of each pair. */
    while(src < ctx->ptr) {
        if(*src == '"')
            src++;
        *dst++ = *src++;
    }
    *dst = 0;

    ctx->ptr++;
    return tString;
}

207 208
/*
 * Lexes a decimal numeric literal starting at ctx->ptr.
 *
 * The digits are accumulated into a 64-bit mantissa 'd' and a decimal exponent
 * 'exp'; digits that no longer fit into the mantissa are dropped and only
 * adjust the exponent.  The value is written through 'ret', either as a LONG
 * or as a double.
 *
 * Returns:
 *   '0'      for a lone zero (not followed by a digit or '.'),
 *   tInt     when the literal has no fraction/exponent and fits into a LONG,
 *   tDouble  otherwise,
 *   0        for a malformed or out-of-range literal.
 */
static int parse_numeric_literal(parser_ctx_t *ctx, void **ret)
{
    BOOL use_int = TRUE;
    LONGLONG d = 0;
    int exp = 0, digit;
    double r;

    if(*ctx->ptr == '0' && !('0' <= ctx->ptr[1] && ctx->ptr[1] <= '9') && ctx->ptr[1] != '.')
        return *ctx->ptr++;

    /* Integer part.  The overflow test is done before multiplying so that the
     * signed arithmetic itself can never overflow. */
    while(ctx->ptr < ctx->end && is_digit(*ctx->ptr)) {
        digit = *(ctx->ptr++) - '0';
        if(d > (MAXLONGLONG - digit)/10) {
            /* Mantissa is full: drop this digit but keep the magnitude. */
            exp++;
            break;
        }
        d = d*10 + digit;
    }
    /* Remaining integer digits only scale the value. */
    while(ctx->ptr < ctx->end && is_digit(*ctx->ptr)) {
        exp++;
        ctx->ptr++;
    }

    if(*ctx->ptr == '.') {
        use_int = FALSE;
        ctx->ptr++;

        while(ctx->ptr < ctx->end && is_digit(*ctx->ptr)) {
            digit = *(ctx->ptr++) - '0';
            if(d > (MAXLONGLONG - digit)/10)
                break;

            d = d*10 + digit;
            exp--;
        }
        /* Fraction digits beyond the mantissa's precision are ignored. */
        while(ctx->ptr < ctx->end && is_digit(*ctx->ptr))
            ctx->ptr++;
    }

    if(*ctx->ptr == 'e' || *ctx->ptr == 'E') {
        int e = 0, sign = 1;

        ctx->ptr++;
        if(*ctx->ptr == '-') {
            ctx->ptr++;
            sign = -1;
        }else if(*ctx->ptr == '+') {
            ctx->ptr++;
        }

        if(!is_digit(*ctx->ptr)) {
            FIXME("Invalid numeric literal\n");
            return 0;
        }

        use_int = FALSE;

        do {
            e = e*10 + *(ctx->ptr++) - '0';
            if(sign == -1 && -e+exp < -(INT_MAX/100)) {
                /* The literal will be rounded to 0 anyway. */
                while(is_digit(*ctx->ptr))
                    ctx->ptr++;
                *(double*)ret = 0;
                return tDouble;
            }

            if(sign*e + exp > INT_MAX/100) {
                FIXME("Invalid numeric literal\n");
                return 0;
            }
        } while(is_digit(*ctx->ptr));

        exp += sign*e;
    }

    if(use_int && (LONG)d == d) {
        *(LONG*)ret = d;
        return tInt;
    }

    r = exp>=0 ? d*pow(10, exp) : d/pow(10, -exp);
    if(isinf(r)) {
        FIXME("Invalid numeric literal\n");
        return 0;
    }

    *(double*)ret = r;
    return tDouble;
}

299 300 301 302 303 304 305 306 307 308 309 310 311 312
/* Returns the value (0-15) of the hexadecimal digit c, or -1 if c is not one. */
static int hex_to_int(WCHAR c)
{
    if(c >= '0' && c <= '9')
        return c - '0';
    if(c >= 'a' && c <= 'f')
        return 10 + (c - 'a');
    if(c >= 'A' && c <= 'F')
        return 10 + (c - 'A');
    return -1;
}

/*
 * Lexes the digits of a hexadecimal literal; ctx->ptr points at the 'h'/'H'
 * that follows the leading '&'.
 *
 * At most eight hex digits are accepted and the literal must not run straight
 * into an identifier character.  A trailing '&' is consumed and the value is
 * stored as is; without it, a value that fits into 16 bits is stored
 * sign-extended from INT16.
 *
 * Returns tInt, or 0 for an invalid literal.
 */
static int parse_hex_literal(parser_ctx_t *ctx, LONG *ret)
{
    const WCHAR *start = ctx->ptr;
    unsigned value = 0;
    int digit;

    for(;;) {
        digit = hex_to_int(*++ctx->ptr);
        if(digit == -1)
            break;
        value = value*16 + digit;
    }

    /* start is the 'h', so more than 8 digits puts ctx->ptr beyond start+9. */
    if(ctx->ptr > start + 9 || (*ctx->ptr != '&' && is_identifier_char(*ctx->ptr))) {
        FIXME("invalid literal\n");
        return 0;
    }

    if(*ctx->ptr == '&') {
        ctx->ptr++;
        *ret = value;
        return tInt;
    }

    *ret = value == (UINT16)value ? (INT16)value : value;
    return tInt;
}

332 333
/* Advances ctx->ptr past any run of spaces and tabs (newlines are not skipped). */
static void skip_spaces(parser_ctx_t *ctx)
{
    for(;;) {
        WCHAR ch = *ctx->ptr;

        if(ch != ' ' && ch != '\t')
            return;
        ctx->ptr++;
    }
}

338 339
/*
 * Skips the rest of the current line: ctx->ptr is moved just past the next
 * '\n' or '\r', or to ctx->end when there is none.  Always returns tNL.
 */
static int comment_line(parser_ctx_t *ctx)
{
    const WCHAR *eol = wcspbrk(ctx->ptr, L"\n\r");

    ctx->ptr = eol ? eol + 1 : ctx->end;
    return tNL;
}

349
/*
 * Reads one raw token from the input.
 *
 * Leading spaces and tabs are skipped, *loc receives the token's offset from
 * ctx->code, and any token value is stored through lval.  Returns the token
 * code, or 0 on error.  At the end of the input a tNL is returned unless the
 * previous token already was one, in which case 0 is returned.
 */
static int parse_next_token(void *lval, unsigned *loc, parser_ctx_t *ctx)
{
    WCHAR ch;

    skip_spaces(ctx);
    *loc = ctx->ptr - ctx->code;

    if(ctx->ptr == ctx->end) {
        if(ctx->last_token == tNL)
            return 0;
        return tNL;
    }

    ch = *ctx->ptr;

    if(ch >= '0' && ch <= '9')
        return parse_numeric_literal(ctx, lval);

    if(iswalpha(ch)) {
        int kw = 0;

        /* Keyword lookup is skipped for a word that directly follows a dot token. */
        if(ctx->last_token != '.' && ctx->last_token != tDOT)
            kw = check_keywords(ctx, lval);
        if(!kw)
            return parse_identifier(ctx, lval);
        if(kw != tREM)
            return kw;

        /* "rem" is handled exactly like the comment apostrophe. */
        return comment_line(ctx);
    }

    switch(ch) {
    case '\n':
    case '\r':
        ctx->ptr++;
        return tNL;

    case '\'':
        return comment_line(ctx);

    /* Single-character tokens that stand for themselves. */
    case ':':
    case ')':
    case ',':
    case '=':
    case '+':
    case '*':
    case '/':
    case '^':
    case '\\':
    case '_':
        ctx->ptr++;
        return ch;

    case '.': {
        /*
         * A dot directly after an identifier character or ')' is part of a member
         * expression; otherwise it begins a dot expression (a member access on the
         * enclosing with statement's expression).
         */
        WCHAR prev = ctx->ptr > ctx->code ? ctx->ptr[-1] : '\n';

        ctx->ptr++;
        if(is_identifier_char(prev) || prev == ')')
            return '.';
        return tDOT;
    }

    case '-':
        /* In HTML mode "-->" is treated like a comment line. */
        if(ctx->is_html && ctx->ptr[1] == '-' && ctx->ptr[2] == '>')
            return comment_line(ctx);
        ctx->ptr++;
        return '-';

    case '(':
        /* NOTE:
         * Empty brackets are resolved here rather than in the parser to avoid complex
         * conflicts in the call statement special case |f()| without 'call' keyword.
         */
        ctx->ptr++;
        skip_spaces(ctx);
        if(*ctx->ptr == ')') {
            ctx->ptr++;
            return tEMPTYBRACKETS;
        }

        /*
         * The parser can't predict whether the bracket is part of an argument
         * expression or an argument in a call expression, so it is predicted here
         * from the previous token.
         */
        if(ctx->last_token == tIdentifier || ctx->last_token == ')')
            return '(';
        return tEXPRLBRACKET;

    case '"':
        return parse_string_literal(ctx, lval);

    case '&':
        ctx->ptr++;
        if(*ctx->ptr == 'h' || *ctx->ptr == 'H')
            return parse_hex_literal(ctx, lval);
        return '&';

    case '<':
        ctx->ptr++;
        if(*ctx->ptr == '>') {
            ctx->ptr++;
            return tNEQ;
        }
        if(*ctx->ptr == '=') {
            ctx->ptr++;
            return tLTEQ;
        }
        /* In HTML mode "<!--" is treated like a comment line. */
        if(*ctx->ptr == '!' && ctx->is_html && ctx->ptr[1] == '-' && ctx->ptr[2] == '-')
            return comment_line(ctx);
        return '<';

    case '>':
        ctx->ptr++;
        if(*ctx->ptr == '=') {
            ctx->ptr++;
            return tGTEQ;
        }
        return '>';

    default:
        FIXME("Unhandled char %c in %s\n", *ctx->ptr, debugstr_w(ctx->ptr));
        break;
    }

    return 0;
}

456
/*
 * Lexer entry point: returns the next token and records it in ctx->last_token.
 *
 * On top of parse_next_token() this
 *  - hands back a pending tEXPRESSION token once, replacing it with tNL in
 *    ctx->last_token,
 *  - joins lines at a '_' continuation (which must be followed, after optional
 *    spaces, by a line break),
 *  - collapses consecutive tNL tokens into one, storing the current input
 *    offset in ctx->last_nl for each one dropped.
 *
 * Returns 0 when '_' is not followed by a line break; ctx->last_token is left
 * unchanged in that case.
 */
int parser_lex(void *lval, unsigned *loc, parser_ctx_t *ctx)
{
    int token;

    if(ctx->last_token == tEXPRESSION) {
        ctx->last_token = tNL;
        return tEXPRESSION;
    }

    for(;;) {
        token = parse_next_token(lval, loc, ctx);

        if(token == '_') {
            skip_spaces(ctx);
            if(*ctx->ptr != '\n' && *ctx->ptr != '\r') {
                FIXME("'_' not followed by newline\n");
                return 0;
            }

            /* Swallow the line break: "\r", "\n" or "\r\n". */
            if(*ctx->ptr == '\r')
                ctx->ptr++;
            if(*ctx->ptr == '\n')
                ctx->ptr++;
            continue;
        }

        if(token == tNL && ctx->last_token == tNL) {
            ctx->last_nl = ctx->ptr-ctx->code;
            continue;
        }

        break;
    }

    return (ctx->last_token = token);
}