lex.c (11572B)
1 /* lex.c: rc's lexical analyzer */ 2 3 #include "rc.h" 4 5 #include "input.h" 6 #include "parse.h" 7 8 /* 9 Special characters (i.e., "non-word") in rc: 10 \t \n # ; & | ^ $ = ~ ` ' { } @ ! ( ) < > \ 11 12 The lexical analyzer is fairly straightforward. The only really 13 unclean part concerns backslash continuation and "double 14 backslashes". A backslash followed by a newline is treated as a 15 space, otherwise backslash is not a special character (i.e., 16 it can be part of a word). This introduces a host of unwanted 17 special cases. In our case, \ cannot be a word character, since 18 we wish to read in all word characters in a tight loop. 19 20 Note: to save the trouble of declaring these arrays with TRUEs 21 and FALSEs, I am assuming that FALSE = 0, TRUE = 1. (and so is 22 it declared in rc.h) 23 */ 24 25 #define BUFSIZE ((size_t) 1000) /* malloc hates power of 2 buffers? */ 26 #define BUFMAX (8 * BUFSIZE) /* How big the buffer can get before we re-allocate the 27 space at BUFSIZE again. Premature optimization? Maybe. 28 */ 29 30 typedef enum wordstates { 31 NW, RW, KW /* "nonword", "realword", "keyword" */ 32 } wordstates; 33 34 static void getpair(int); 35 36 int lineno; 37 38 /* lookup table for non-word characters */ 39 const char nw[] = { 40 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 42 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 43 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 44 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 45 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 47 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 48 }; 49 50 /* lookup table for non-word characters in variable names */ 51 const char dnw[] = { 52 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 53 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 54 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 55 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 56 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 57 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 58 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 59 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 60 }; 61 62 /* lookup table for quotable characters: nw + glob metachars */ 63 const char q[] = { 64 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 66 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 67 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 72 }; 73 74 static size_t bufsize = BUFSIZE; 75 static char *realbuf = NULL; 76 static bool newline = FALSE; 77 static bool errset = FALSE; 78 static bool prerror = FALSE; 79 static wordstates w = NW; 80 static int fd_left, fd_right; 81 82 #define checkfreecaret {if (w != NW) { w = NW; ugchar(c); return '^'; }} 83 84 enum filedescriptors { 85 UNSET = -9, CLOSED = -1 86 }; 87 88 /* does this string require quoting? */ 89 extern bool quotep(char *s, bool dollar) { 90 unsigned char c; 91 const char *meta; 92 93 meta = dollar ? dnw : q; 94 while ((c = *s++)) 95 if (meta[c]) 96 return TRUE; 97 return FALSE; 98 } 99 100 extern int yylex() { 101 static bool dollar = FALSE; 102 bool saw_meta = FALSE; 103 int c; 104 size_t i; /* The purpose of all these local assignments is to */ 105 const char *meta; /* allow optimizing compilers like gcc to load these */ 106 char *buf = realbuf; /* values into registers. On a sparc this is a */ 107 YYSTYPE *y = &yylval; /* win, in code size *and* execution time */ 108 if (errset) { 109 errset = FALSE; 110 return '\n'; 111 } 112 /* rc variable-names may contain only alnum, '*' and '_', so use dnw if we are scanning one. */ 113 meta = (dollar ? dnw : nw); 114 if (newline) { 115 --lineno; /* slight space optimization; nextline() always increments lineno */ 116 nextline(); 117 newline = FALSE; 118 } 119 top: while ((c = gchar()) == ' ' || c == '\t') 120 w = NW; 121 if (c != '(') dollar = FALSE; 122 if (c == EOF) 123 return END; 124 if (!meta[(unsigned char) c]) { /* it's a word or keyword. */ 125 checkfreecaret; 126 w = RW; 127 i = 0; 128 read: do { 129 buf[i++] = c; 130 if (c == '?' || c == '[' || c == '*') 131 saw_meta = TRUE; 132 if (i >= bufsize) 133 buf = realbuf = erealloc(buf, bufsize *= 2); 134 } while ((c = gchar()) != EOF && !meta[(unsigned char) c]); 135 while (c == '\\') { 136 if ((c = gchar()) == '\n') { 137 nextline(); 138 c = ' '; /* Pretend a space was read */ 139 break; 140 } else { 141 bs: if (meta != dnw) { /* all words but varnames may have a bslash */ 142 buf[i++] = '\\'; 143 if (i >= bufsize) 144 buf = realbuf = erealloc(buf, bufsize *= 2); 145 if (!meta[(unsigned char) c]) 146 goto read; 147 } else { 148 ugchar(c); 149 c = '\\'; 150 break; 151 } 152 } 153 } 154 ugchar(c); 155 buf[i] = '\0'; 156 w = KW; 157 if (i == 2) { 158 if (*buf == 'i' && buf[1] == 'f') return IF; 159 if (*buf == 'f' && buf[1] == 'n') return FN; 160 if (*buf == 'i' && buf[1] == 'n') return IN; 161 } 162 if (streq(buf, "not")) return NOT; 163 if (streq(buf, "for")) return FOR; 164 if (streq(buf, "else")) return ELSE; 165 if (streq(buf, "switch")) return SWITCH; 166 if (streq(buf, "while")) return WHILE; 167 if (streq(buf, "case")) return CASE; 168 w = RW; 169 y->word.w = ncpy(buf); 170 if (saw_meta) { 171 char *r, *s; 172 173 y->word.m = nalloc(strlen(buf) + 1); 174 for (r = buf, s = y->word.m; *r != '\0'; r++, s++) 175 *s = (*r == '?' || *r == '[' || *r == '*'); 176 } else { 177 y->word.m = NULL; 178 } 179 y->word.q = FALSE; 180 return WORD; 181 } 182 if (c == '`' || c == '!' || c == '@' || c == '~' || c == '$' || c == '\'' || c == '=') { 183 checkfreecaret; 184 if (c == '!' || c == '@' || c == '~' || c == '=') 185 w = KW; 186 } 187 switch (c) { 188 case '!': 189 return BANG; 190 case '@': 191 return SUBSHELL; 192 case '~': 193 return TWIDDLE; 194 case '`': 195 c = gchar(); 196 if (c == '`') 197 return BACKBACK; 198 ugchar(c); 199 return '`'; 200 case '$': 201 dollar = TRUE; 202 c = gchar(); 203 if (c == '#') 204 return COUNT; 205 if (c == '^' || c == '"') 206 return FLAT; 207 ugchar(c); 208 return '$'; 209 case '\'': 210 w = RW; 211 i = 0; 212 /* double ' to quote it, like this: 'how''s it going?' */ 213 while ((c = gchar()) != '\'' || (c = gchar()) == '\'') { 214 buf[i++] = c; 215 if (c == '\n') 216 nextline(); 217 if (c == EOF) { 218 w = NW; 219 scanerror("eof in quoted string"); 220 return HUH; 221 } 222 if (i >= bufsize) 223 buf = realbuf = erealloc(buf, bufsize *= 2); 224 } 225 ugchar(c); 226 buf[i] = '\0'; 227 y->word.w = ncpy(buf); 228 y->word.m = NULL; 229 y->word.q = TRUE; 230 return WORD; 231 case '\\': 232 if ((c = gchar()) == '\n') { 233 nextline(); 234 goto top; /* Pretend it was just another space. */ 235 } 236 ugchar(c); 237 c = '\\'; 238 checkfreecaret; 239 c = gchar(); 240 i = 0; 241 goto bs; 242 case '(': 243 if (w == RW) /* SUB's happen only after real words, not keywords, so if () and while () work */ 244 c = SUB; 245 w = NW; 246 return c; 247 case '#': 248 while ((c = gchar()) != '\n') /* skip comment until newline */ 249 if (c == EOF) 250 return END; 251 /* FALLTHROUGH */ 252 case '\n': 253 lineno++; 254 newline = TRUE; 255 /* FALLTHROUGH */ 256 case ';': 257 case '^': 258 case ')': 259 case '{': case '}': 260 w = NW; 261 case '=': 262 return c; 263 case '&': 264 w = NW; 265 c = gchar(); 266 if (c == '&') 267 return ANDAND; 268 ugchar(c); 269 return '&'; 270 case '|': 271 w = NW; 272 c = gchar(); 273 if (c == '|') 274 return OROR; 275 getpair(c); 276 if (errset) 277 return HUH; 278 if ((y->pipe.left = fd_left) == UNSET) 279 y->pipe.left = 1; /* default to fd 1 */ 280 if ((y->pipe.right = fd_right) == UNSET) 281 y->pipe.right = 0; /* default to fd 0 */ 282 if (y->pipe.right == CLOSED) { 283 scanerror("expected digit after '='"); /* can't close a pipe */ 284 return HUH; 285 } 286 return PIPE; 287 case '>': 288 c = gchar(); 289 if (c == '>') { 290 c = gchar(); 291 y->redir.type = rAppend; 292 } else 293 y->redir.type = rCreate; 294 y->redir.fd = 1; 295 goto common; 296 case '<': 297 c = gchar(); 298 if (c == '<') { 299 c = gchar(); 300 if (c == '<') { 301 c = gchar(); 302 y->redir.type = rHerestring; 303 } else { 304 y->redir.type = rHeredoc; 305 } 306 } else 307 y->redir.type = rFrom; 308 y->redir.fd = 0; 309 common: 310 w = NW; 311 getpair(c); 312 if (errset) 313 return HUH; 314 if (fd_right == UNSET) { /* redirection, not dup */ 315 if (fd_left != UNSET) { 316 y->redir.fd = fd_left; 317 return SREDIR; 318 } 319 return (y->redir.type == rFrom || y->redir.type == rCreate) ? REDIR : SREDIR; 320 } else { /* dup; recast yylval */ 321 y->dup.type = y->redir.type; 322 y->dup.left = fd_left; 323 y->dup.right = fd_right; 324 return DUP; 325 } 326 default: 327 w = NW; 328 return c; /* don't know what it is, let yacc barf on it */ 329 } 330 } 331 332 extern void yyerror(const char *s) { 333 char *tok; 334 if (prerror) { /* don't print "syntax error" if there's a more informative scanerror */ 335 prerror = FALSE; 336 return; 337 } 338 if (!interactive) { 339 if (w != NW) 340 tok = realbuf; 341 else if (lastchar == EOF) 342 tok = "eof"; 343 else if (lastchar == '\n') 344 tok = "end of line"; 345 else 346 tok = nprint((lastchar < 32 || lastchar > 126) ? "(decimal %d)" : "'%c'", lastchar); 347 fprint(2, "rc: line %d: %s near %s\n", lineno - (lastchar == '\n'), s, tok); 348 } else 349 fprint(2, "rc: %s\n", s); 350 } 351 352 extern void scanerror(char *s) { 353 skiptonl(); /* flush up to newline */ 354 yyerror(s); 355 errset = prerror = TRUE; 356 } 357 358 extern void inityy() { 359 newline = FALSE; 360 w = NW; 361 hq = NULL; 362 /* return memory to the system if the buffer got too large */ 363 if (bufsize > BUFMAX && realbuf != NULL) { 364 efree(realbuf); 365 bufsize = BUFSIZE; 366 realbuf = ealloc(bufsize); 367 } else if (realbuf == NULL) 368 realbuf = ealloc(bufsize); 369 } 370 371 /* 372 Scan in a pair of integers for redirections like >[2=1]. CLOSED represents a closed file 373 descriptor (i.e., >[2=]) and UNSET represents an undesignated file descriptor (e.g., 374 >[2] is represented as (2,UNSET). 375 376 This function makes use of unsigned compares to make range tests in one compare operation. 377 */ 378 379 static void getpair(int c) { 380 int n; 381 fd_left = fd_right = UNSET; 382 if (c != '[') { 383 ugchar(c); 384 return; 385 } 386 if ((unsigned int) (n = gchar() - '0') > 9) { 387 scanerror("expected digit after '['"); 388 return; 389 } 390 while ((unsigned int) (c = gchar() - '0') <= 9) 391 n = n * 10 + c; 392 fd_left = n; 393 c += '0'; 394 switch (c) { 395 default: 396 scanerror("expected '=' or ']' after digit"); 397 return; 398 case ']': 399 return; 400 case '=': 401 if ((unsigned int) (n = gchar() - '0') > 9) { 402 if (n != ']' - '0') { 403 scanerror("expected digit or ']' after '='"); 404 return; 405 } 406 fd_right = CLOSED; 407 } else { 408 while ((unsigned int) (c = gchar() - '0') <= 9) 409 n = n * 10 + c; 410 if (c != ']' - '0') { 411 scanerror("expected ']' after digit"); 412 return; 413 } 414 fd_right = n; 415 } 416 } 417 }