rc

[fork] interactive rc shell
git clone https://hhvn.uk/rc
git clone git://hhvn.uk/rc
Log | Files | Refs | README | LICENSE

lex.c (11572B)


      1 /* lex.c: rc's lexical analyzer */
      2 
#include "rc.h"

#include <limits.h>

#include "input.h"
#include "parse.h"
      7 
      8 /*
      9 	Special characters (i.e., "non-word") in rc:
     10 		\t \n # ; & | ^ $ = ~ ` ' { } @ ! ( ) < > \
     11 
     12 	The lexical analyzer is fairly straightforward. The only really
     13 	unclean part concerns backslash continuation and "double
     14 	backslashes". A backslash followed by a newline is treated as a
     15 	space, otherwise backslash is not a special character (i.e.,
     16 	it can be part of a word).  This introduces a host of unwanted
     17 	special cases. In our case, \ cannot be a word character, since
     18 	we wish to read in all word characters in a tight loop.
     19 
     20 	Note: to save the trouble of declaring these arrays with TRUEs
     21 	and FALSEs, I am assuming that FALSE = 0, TRUE = 1. (and so is
     22 	it declared in rc.h)
     23 */
     24 
/* Initial size, in bytes, of the scanner's token buffer (see realbuf/bufsize below). */
#define BUFSIZE ((size_t) 1000)	/*	malloc hates power of 2 buffers? */
#define BUFMAX (8 * BUFSIZE)	/* 	How big the buffer can get before we re-allocate the
					space at BUFSIZE again. Premature optimization? Maybe.
				*/

/*
	Classification of the token most recently returned by yylex(). It is
	consulted to decide whether a free caret ('^') must be inserted before
	the next token and whether '(' is returned as SUB.
*/
typedef enum wordstates {
	NW, RW, KW /* "nonword", "realword", "keyword" */
} wordstates;

/* Defined at the bottom of this file; parses the optional [n], [n=] or [n=m] suffix of a redirection. */
static void getpair(int);

/* Line counter used in error messages; yylex() increments it when it returns a newline token. */
int lineno;
     37 
/*
	lookup table for non-word characters

	Indexed by an unsigned char value; laid out as 8 rows of 32 entries
	(256 in total). A 1 marks a character that terminates a word:
	NUL, \t, \n, space and ! # $ & ' ( ) ; < = > @ \ ^ ` { | } ~
	Every byte from 127 upwards is a word character.
*/
const char nw[] = {
	1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
     49 
/*
	lookup table for non-word characters in variable names

	Same layout as nw (8 rows of 32 entries, indexed by unsigned char).
	Only '*', the digits, the ASCII letters and '_' are 0, i.e. allowed in
	a variable name; every other byte, including all bytes from 127
	upwards, is 1.
*/
const char dnw[] = {
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
     61 
/*
	lookup table for quotable characters: nw + glob metachars

	Same layout as nw (8 rows of 32 entries, indexed by unsigned char).
	Compared with nw, the entries for '*', '?', '[' and ']' are also 1.
	quotep() consults this table when its dollar argument is false.
*/
const char q[] = {
	1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
     73 
/* Scanner state shared by the functions in this file. */
static size_t bufsize = BUFSIZE;	/* current capacity of realbuf; yylex() doubles it when a token fills the buffer */
static char *realbuf = NULL;		/* token buffer; allocated by inityy() */
static bool newline = FALSE;		/* the last token was a newline: yylex() calls nextline() on its next entry */
static bool errset = FALSE;		/* set by scanerror(); makes the next yylex() call return '\n' and clear it */
static bool prerror = FALSE;		/* set by scanerror(); makes the next yyerror() call return without printing */
static wordstates w = NW;		/* kind of the most recently scanned token */
static int fd_left, fd_right;		/* descriptors parsed by the most recent getpair() call */

/*
	Free-caret check, for use inside yylex() only (it reads the local c and
	may return from the enclosing function): if the previous token was a
	word or keyword, push c back and return an implicit '^' first.
*/
#define checkfreecaret {if (w != NW) { w = NW; ugchar(c); return '^'; }}

/* Sentinels stored in fd_left/fd_right: UNSET = no descriptor given, CLOSED = "[n=]" form. */
enum filedescriptors {
	UNSET = -9, CLOSED = -1
};
     87 
     88 /* does this string require quoting? */
     89 extern bool quotep(char *s, bool dollar) {
     90 	unsigned char c;
     91 	const char *meta;
     92 
     93 	meta = dollar ? dnw : q;
     94 	while ((c = *s++))
     95 		if (meta[c])
     96 			return TRUE;
     97 	return FALSE;
     98 }
     99 
    100 extern int yylex() {
    101 	static bool dollar = FALSE;
    102 	bool saw_meta = FALSE;
    103 	int c;
    104 	size_t i;			/* The purpose of all these local assignments is to	*/
    105 	const char *meta;		/* allow optimizing compilers like gcc to load these	*/
    106 	char *buf = realbuf;		/* values into registers. On a sparc this is a		*/
    107 	YYSTYPE *y = &yylval;		/* win, in code size *and* execution time		*/
    108 	if (errset) {
    109 		errset = FALSE;
    110 		return '\n';
    111 	}
    112 	/* rc variable-names may contain only alnum, '*' and '_', so use dnw if we are scanning one. */
    113 	meta = (dollar ? dnw : nw);
    114 	if (newline) {
    115 		--lineno; /* slight space optimization; nextline() always increments lineno */
    116 		nextline();
    117 		newline = FALSE;
    118 	}
    119 top:	while ((c = gchar()) == ' ' || c == '\t')
    120 		w = NW;
    121 	if (c != '(') dollar = FALSE;
    122 	if (c == EOF)
    123 		return END;
    124 	if (!meta[(unsigned char) c]) {	/* it's a word or keyword. */
    125 		checkfreecaret;
    126 		w = RW;
    127 		i = 0;
    128 	read:	do {
    129 			buf[i++] = c;
    130 			if (c == '?' || c == '[' || c == '*')
    131 				saw_meta = TRUE;
    132 			if (i >= bufsize)
    133 				buf = realbuf = erealloc(buf, bufsize *= 2);
    134 		} while ((c = gchar()) != EOF && !meta[(unsigned char) c]);
    135 		while (c == '\\') {
    136 			if ((c = gchar()) == '\n') {
    137 				nextline();
    138 				c = ' '; /* Pretend a space was read */
    139 				break;
    140 			} else {
    141 	bs:			if (meta != dnw) { /* all words but varnames may have a bslash */
    142 					buf[i++] = '\\';
    143 					if (i >= bufsize)
    144 						buf = realbuf = erealloc(buf, bufsize *= 2);
    145 					if (!meta[(unsigned char) c])
    146 						goto read;
    147 				} else {
    148 					ugchar(c);
    149 					c = '\\';
    150 					break;
    151 				}
    152 			}
    153 		}
    154 		ugchar(c);
    155 		buf[i] = '\0';
    156 		w = KW;
    157 		if (i == 2) {
    158 			if (*buf == 'i' && buf[1] == 'f') return IF;
    159 			if (*buf == 'f' && buf[1] == 'n') return FN;
    160 			if (*buf == 'i' && buf[1] == 'n') return IN;
    161 		}
    162 		if (streq(buf, "not")) return NOT;
    163 		if (streq(buf, "for")) return FOR;
    164 		if (streq(buf, "else")) return ELSE;
    165 		if (streq(buf, "switch")) return SWITCH;
    166 		if (streq(buf, "while")) return WHILE;
    167 		if (streq(buf, "case")) return CASE;
    168 		w = RW;
    169 		y->word.w = ncpy(buf);
    170 		if (saw_meta) {
    171 			char *r, *s;
    172 
    173 			y->word.m = nalloc(strlen(buf) + 1);
    174 			for (r = buf, s = y->word.m; *r != '\0'; r++, s++)
    175 				*s = (*r == '?' || *r == '[' || *r == '*');
    176 		} else {
    177 			y->word.m = NULL;
    178 		}
    179 		y->word.q = FALSE;
    180 		return WORD;
    181 	}
    182 	if (c == '`' || c == '!' || c == '@' || c == '~' || c == '$' || c == '\'' || c == '=') {
    183 		checkfreecaret;
    184 		if (c == '!' || c == '@' || c == '~' || c == '=')
    185 			w = KW;
    186 	}
    187 	switch (c) {
    188 	case '!':
    189 		return BANG;
    190 	case '@':
    191 		return SUBSHELL;
    192 	case '~':
    193 		return TWIDDLE;
    194 	case '`':
    195 		c = gchar();
    196 		if (c == '`')
    197 			return BACKBACK;
    198 		ugchar(c);
    199 		return '`';
    200 	case '$':
    201 		dollar = TRUE;
    202 		c = gchar();
    203 		if (c == '#')
    204 			return COUNT;
    205 		if (c == '^' || c == '"')
    206 			return FLAT;
    207 		ugchar(c);
    208 		return '$';
    209 	case '\'':
    210 		w = RW;
    211 		i = 0;
    212 		/* double ' to quote it, like this: 'how''s it going?' */
    213 		while ((c = gchar()) != '\'' || (c = gchar()) == '\'') {
    214 			buf[i++] = c;
    215 			if (c == '\n')
    216 				nextline();
    217 			if (c == EOF) {
    218 				w = NW;
    219 				scanerror("eof in quoted string");
    220 				return HUH;
    221 			}
    222 			if (i >= bufsize)
    223 				buf = realbuf = erealloc(buf, bufsize *= 2);
    224 		}
    225 		ugchar(c);
    226 		buf[i] = '\0';
    227 		y->word.w = ncpy(buf);
    228 		y->word.m = NULL;
    229 		y->word.q = TRUE;
    230 		return WORD;
    231 	case '\\':
    232 		if ((c = gchar()) == '\n') {
    233 			nextline();
    234 			goto top; /* Pretend it was just another space. */
    235 		}
    236 		ugchar(c);
    237 		c = '\\';
    238 		checkfreecaret;
    239 		c = gchar();
    240 		i = 0;
    241 		goto bs;
    242 	case '(':
    243 		if (w == RW) /* SUB's happen only after real words, not keywords, so if () and while () work */
    244 			c = SUB;
    245 		w = NW;
    246 		return c;
    247 	case '#':
    248 		while ((c = gchar()) != '\n') /* skip comment until newline */
    249 			if (c == EOF)
    250 				return END;
    251 		/* FALLTHROUGH */
    252 	case '\n':
    253 		lineno++;
    254 		newline = TRUE;
    255 		/* FALLTHROUGH */
    256 	case ';':
    257 	case '^':
    258 	case ')':
    259 	case '{': case '}':
    260 		w = NW;
    261 	case '=':
    262 		return c;
    263 	case '&':
    264 		w = NW;
    265 		c = gchar();
    266 		if (c == '&')
    267 			return ANDAND;
    268 		ugchar(c);
    269 		return '&';
    270 	case '|':
    271 		w = NW;
    272 		c = gchar();
    273 		if (c == '|')
    274 			return OROR;
    275 		getpair(c);
    276 		if (errset)
    277 			return HUH;
    278 		if ((y->pipe.left = fd_left) == UNSET)
    279 			y->pipe.left = 1;				/* default to fd 1 */
    280 		if ((y->pipe.right = fd_right) == UNSET)
    281 			y->pipe.right = 0;				/* default to fd 0 */
    282 		if (y->pipe.right == CLOSED) {
    283 			scanerror("expected digit after '='");		/* can't close a pipe */
    284 			return HUH;
    285 		}
    286 		return PIPE;
    287 	case '>':
    288 		c = gchar();
    289 		if (c == '>') {
    290 			c = gchar();
    291 			y->redir.type = rAppend;
    292 		} else
    293 			y->redir.type = rCreate;
    294 		y->redir.fd = 1;
    295 		goto common;
    296 	case '<':
    297 		c = gchar();
    298 		if (c == '<') {
    299 			c = gchar();
    300 			if (c == '<') {
    301 				c = gchar();
    302 				y->redir.type = rHerestring;
    303 			} else {
    304 				y->redir.type = rHeredoc;
    305 			}
    306 		} else
    307 			y->redir.type = rFrom;
    308 		y->redir.fd = 0;
    309 	common:
    310 		w = NW;
    311 		getpair(c);
    312 		if (errset)
    313 			return HUH;
    314 		if (fd_right == UNSET) { /* redirection, not dup */
    315 			if (fd_left != UNSET) {
    316 				y->redir.fd = fd_left;
    317 				return SREDIR;
    318 			}
    319 			return (y->redir.type == rFrom || y->redir.type == rCreate) ? REDIR : SREDIR;
    320 		} else { /* dup; recast yylval */
    321 			y->dup.type = y->redir.type;
    322 			y->dup.left = fd_left;
    323 			y->dup.right = fd_right;
    324 			return DUP;
    325 		}
    326 	default:
    327 		w = NW;
    328 		return c; /* don't know what it is, let yacc barf on it */
    329 	}
    330 }
    331 
    332 extern void yyerror(const char *s) {
    333 	char *tok;
    334 	if (prerror) { /* don't print "syntax error" if there's a more informative scanerror */
    335 		prerror = FALSE;
    336 		return;
    337 	}
    338 	if (!interactive) {
    339 		if (w != NW)
    340 			tok = realbuf;
    341 		else if (lastchar == EOF)
    342 			tok = "eof";
    343 		else if (lastchar == '\n')
    344 			tok = "end of line";
    345 		else
    346 			tok = nprint((lastchar < 32 || lastchar > 126) ? "(decimal %d)" : "'%c'", lastchar);
    347 		fprint(2, "rc: line %d: %s near %s\n", lineno - (lastchar == '\n'), s, tok);
    348 	} else
    349 		fprint(2, "rc: %s\n", s);
    350 }
    351 
    352 extern void scanerror(char *s) {
    353 	skiptonl(); /* flush up to newline */
    354 	yyerror(s);
    355 	errset = prerror = TRUE;
    356 }
    357 
    358 extern void inityy() {
    359 	newline = FALSE;
    360 	w = NW;
    361 	hq = NULL;
    362 	/* return memory to the system if the buffer got too large */
    363 	if (bufsize > BUFMAX && realbuf != NULL) {
    364 		efree(realbuf);
    365 		bufsize = BUFSIZE;
    366 		realbuf = ealloc(bufsize);
    367 	} else if (realbuf == NULL)
    368 		realbuf = ealloc(bufsize);
    369 }
    370 
    371 /*
    372    Scan in a pair of integers for redirections like >[2=1]. CLOSED represents a closed file
    373    descriptor (i.e., >[2=]) and UNSET represents an undesignated file descriptor (e.g.,
    374    >[2] is represented as (2,UNSET).
    375 
    376    This function makes use of unsigned compares to make range tests in one compare operation.
    377 */
    378 
    379 static void getpair(int c) {
    380 	int n;
    381 	fd_left = fd_right = UNSET;
    382 	if (c != '[') {
    383 		ugchar(c);
    384 		return;
    385 	}
    386 	if ((unsigned int) (n = gchar() - '0') > 9) {
    387 		scanerror("expected digit after '['");
    388 		return;
    389 	}
    390 	while ((unsigned int) (c = gchar() - '0') <= 9)
    391 		n = n * 10 + c;
    392 	fd_left = n;
    393 	c += '0';
    394 	switch (c) {
    395 	default:
    396 		scanerror("expected '=' or ']' after digit");
    397 		return;
    398 	case ']':
    399 		return;
    400 	case '=':
    401 		if ((unsigned int) (n = gchar() - '0') > 9) {
    402 			if (n != ']' - '0') {
    403 				scanerror("expected digit or ']' after '='");
    404 				return;
    405 			}
    406 			fd_right = CLOSED;
    407 		} else {
    408 			while ((unsigned int) (c = gchar() - '0') <= 9)
    409 				n = n * 10 + c;
    410 			if (c != ']' - '0') {
    411 				scanerror("expected ']' after digit");
    412 				return;
    413 			}
    414 			fd_right = n;
    415 		}
    416 	}
    417 }