3 * Copyright (c) 2001-2003,
4 * George C. Necula <necula@cs.berkeley.edu>
5 * Scott McPeak <smcpeak@cs.berkeley.edu>
6 * Wes Weimer <weimer@cs.berkeley.edu>
7 * Ben Liblit <liblit@cs.berkeley.edu>
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions are
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
21 * 3. The names of the contributors may not be used to endorse or promote
22 * products derived from this software without specific prior written
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
26 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
28 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
29 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 (* FrontC -- lexical analyzer
40 ** 1.0 3.22.99 Hugues Cassé First version.
41 ** 2.0 George Necula 12/12/00: Many extensions
53 let cp = lb.lex_curr_p in
54 lb.lex_curr_p <- { cp with pos_lnum = 1 + cp.pos_lnum }
56 let setCurrentLine lb lineno =
57 let cp = lb.lex_curr_p in
58 lb.lex_curr_p <- { cp with pos_lnum = lineno }
60 let setCurrentFile lb file =
61 let cp = lb.lex_curr_p in
62 lb.lex_curr_p <- { cp with pos_fname = file }
64 let matchingParsOpen = ref 0
66 let currentLoc = Cabshelper.currentLoc_lexbuf
68 let int64_to_char value =
69 assert (value <= 255L && value >= 0L);
70 Char.chr (Int64.to_int value)
72 (* takes a not-nul-terminated list, and converts it to a string. *)
73 let rec intlist_to_string (str: int64 list):string =
75 [] -> "" (* add nul-termination *)
77 let this_char = int64_to_char value in
78 (String.make 1 this_char) ^ (intlist_to_string rest)
83 let lexicon = H.create 211
87 (fun (key, builder) -> H.add lexicon key builder)
88 [ ("_Bool", fun loc -> UNDERSCORE_BOOL loc);
89 ("auto", fun loc -> AUTO loc);
90 ("const", fun loc -> CONST loc);
91 ("__const", fun loc -> CONST loc);
92 ("__const__", fun loc -> CONST loc);
93 ("static", fun loc -> STATIC loc);
94 ("extern", fun loc -> EXTERN loc);
95 ("long", fun loc -> LONG loc);
96 ("short", fun loc -> SHORT loc);
97 ("register", fun loc -> REGISTER loc);
98 ("signed", fun loc -> SIGNED loc);
99 ("__signed", fun loc -> SIGNED loc);
100 ("unsigned", fun loc -> UNSIGNED loc);
101 ("volatile", fun loc -> VOLATILE loc);
102 ("__volatile", fun loc -> VOLATILE loc);
103 (* WW: see /usr/include/sys/cdefs.h for why __signed and __volatile
104 * are accepted GCC-isms *)
105 ("char", fun loc -> CHAR loc);
106 ("int", fun loc -> INT loc);
107 ("float", fun loc -> FLOAT loc);
108 ("double", fun loc -> DOUBLE loc);
109 ("void", fun loc -> VOID loc);
110 ("enum", fun loc -> ENUM loc);
111 ("struct", fun loc -> STRUCT loc);
112 ("typedef", fun loc -> TYPEDEF loc);
113 ("union", fun loc -> UNION loc);
114 ("break", fun loc -> BREAK loc);
115 ("continue", fun loc -> CONTINUE loc);
116 ("goto", fun loc -> GOTO loc);
117 ("return", fun loc -> RETURN loc);
118 ("switch", fun loc -> SWITCH loc);
119 ("case", fun loc -> CASE loc);
120 ("default", fun loc -> DEFAULT loc);
121 ("while", fun loc -> WHILE loc);
122 ("do", fun loc -> DO loc);
123 ("for", fun loc -> FOR loc);
124 ("if", fun loc -> IF loc);
125 ("else", fun _ -> ELSE);
126 (*** Implementation specific keywords ***)
127 ("__signed__", fun loc -> SIGNED loc);
128 ("__inline__", fun loc -> INLINE loc);
129 ("inline", fun loc -> INLINE loc);
130 ("__inline", fun loc -> INLINE loc);
131 ("_inline", fun loc ->
135 IDENT ("_inline", loc));
136 ("__attribute__", fun loc -> ATTRIBUTE loc);
137 ("__attribute", fun loc -> ATTRIBUTE loc);
139 ("__attribute_used__", fun loc -> ATTRIBUTE_USED loc);
141 ("__blockattribute__", fun _ -> BLOCKATTRIBUTE);
142 ("__blockattribute", fun _ -> BLOCKATTRIBUTE);
143 ("__asm__", fun loc -> ASM loc);
144 ("asm", fun loc -> ASM loc);
145 ("__typeof__", fun loc -> TYPEOF loc);
146 ("__typeof", fun loc -> TYPEOF loc);
147 ("typeof", fun loc -> TYPEOF loc);
148 ("__alignof", fun loc -> ALIGNOF loc);
149 ("__alignof__", fun loc -> ALIGNOF loc);
150 ("__volatile__", fun loc -> VOLATILE loc);
151 ("__volatile", fun loc -> VOLATILE loc);
153 ("__FUNCTION__", fun loc -> FUNCTION__ loc);
154 ("__func__", fun loc -> FUNCTION__ loc); (* ISO 6.4.2.2 *)
155 ("__PRETTY_FUNCTION__", fun loc -> PRETTY_FUNCTION__ loc);
156 ("__label__", fun _ -> LABEL__);
157 (*** weimer: GCC arcana ***)
158 ("__restrict", fun loc -> RESTRICT loc);
159 ("restrict", fun loc -> RESTRICT loc);
160 (* ("__extension__", EXTENSION); *)
162 ("__int64", fun loc -> INT64 loc);
163 ("__int32", fun loc -> INT loc);
164 ("_cdecl", fun loc -> MSATTR ("_cdecl", loc));
165 ("__cdecl", fun loc -> MSATTR ("__cdecl", loc));
166 ("_stdcall", fun loc -> MSATTR ("_stdcall", loc));
167 ("__stdcall", fun loc -> MSATTR ("__stdcall", loc));
168 ("_fastcall", fun loc -> MSATTR ("_fastcall", loc));
169 ("__fastcall", fun loc -> MSATTR ("__fastcall", loc));
170 ("__w64", fun loc -> MSATTR("__w64", loc));
171 ("__declspec", fun loc -> DECLSPEC loc);
172 ("__forceinline", fun loc -> INLINE loc); (* !! we turn forceinline
174 ("__try", fun loc -> TRY loc);
175 ("__except", fun loc -> EXCEPT loc);
176 ("__finally", fun loc -> FINALLY loc);
177 (* weimer: some files produced by 'GCC -E' expect this type to be
179 ("__builtin_va_list", fun loc -> NAMED_TYPE ("__builtin_va_list", loc));
180 ("__builtin_va_arg", fun loc -> BUILTIN_VA_ARG loc);
181 ("__builtin_types_compatible_p", fun loc -> BUILTIN_TYPES_COMPAT loc);
182 ("__builtin_offsetof", fun loc -> BUILTIN_OFFSETOF loc);
183 (* On some versions of GCC __thread is a regular identifier *)
184 ("__thread", fun loc -> THREAD loc)
187 (* Mark an identifier as a type name. The old mapping is preserved and will
188 * be reinstated when we exit this context *)
190 (* ignore (print_string ("adding type name " ^ name ^ "\n")); *)
191 H.add lexicon name (fun loc -> NAMED_TYPE (name, loc))
193 let context : string list list ref = ref []
195 let push_context _ = context := []::!context
202 List.iter (fun name ->
203 (* ignore (print_string ("removing lexicon for " ^ name ^ "\n")); *)
204 H.remove lexicon name) con)
206 (* Mark an identifier as a variable name. The old mapping is preserved and
207 * will be reinstated when we exit this context *)
208 let add_identifier name =
210 [] -> () (* Just ignore raise (InternalError "Empty context stack") *)
212 context := (name::con)::sub;
213 H.add lexicon name (fun loc -> IDENT (name, loc))
219 let scan_ident lb id =
220 let here = currentLoc lb in
221 try (H.find lexicon id) here
222 (* default to variable name, as opposed to type *)
223 with Not_found -> IDENT (id, here)
233 (* Inititialize the pointer in Errormsg *)
234 Parse_aux.add_type := add_type;
235 Parse_aux.push_context := push_context;
236 Parse_aux.pop_context := pop_context;
237 Parse_aux.add_identifier := add_identifier;
240 let cp = lb.lex_curr_p in
241 lb.lex_curr_p <- {cp with pos_lnum = 1};
247 (*** Error handling ***)
248 let error = parse_error
251 (*** escape character management ***)
252 let scan_escape (char: char) : int64 =
253 let result = match char with
258 | 'f' -> '\012' (* ASCII code 12 *)
259 | 'v' -> '\011' (* ASCII code 11 *)
260 | 'a' -> '\007' (* ASCII code 7 *)
261 | 'e' | 'E' -> '\027' (* ASCII code 27. This is a GCC extension *)
263 | '"'-> '"' (* '"' *)
265 | '(' when not !msvcMode -> '('
266 | '{' when not !msvcMode -> '{'
267 | '[' when not !msvcMode -> '['
268 | '%' when not !msvcMode -> '%'
270 | other -> error ("Unrecognized escape sequence: \\" ^ (String.make 1 other)); raise Parsing.Parse_error
272 Int64.of_int (Char.code result)
274 let scan_hex_escape str =
275 let radix = Int64.of_int 16 in
276 let the_value = ref Int64.zero in
277 (* start at character 2 to skip the \x *)
278 for i = 2 to (String.length str) - 1 do
279 let thisDigit = Cabshelper.valueOfDigit (String.get str i) in
280 (* the_value := !the_value * 16 + thisDigit *)
281 the_value := Int64.add (Int64.mul !the_value radix) thisDigit
285 let scan_oct_escape str =
286 let radix = Int64.of_int 8 in
287 let the_value = ref Int64.zero in
288 (* start at character 1 to skip the \x *)
289 for i = 1 to (String.length str) - 1 do
290 let thisDigit = Cabshelper.valueOfDigit (String.get str i) in
291 (* the_value := !the_value * 8 + thisDigit *)
292 the_value := Int64.add (Int64.mul !the_value radix) thisDigit
296 let lex_hex_escape remainder lexbuf =
297 let prefix = scan_hex_escape (Lexing.lexeme lexbuf) in
298 prefix :: remainder lexbuf
300 let lex_oct_escape remainder lexbuf =
301 let prefix = scan_oct_escape (Lexing.lexeme lexbuf) in
302 prefix :: remainder lexbuf
304 let lex_simple_escape remainder lexbuf =
305 let lexchar = Lexing.lexeme_char lexbuf 1 in
306 let prefix = scan_escape lexchar in
307 prefix :: remainder lexbuf
309 let lex_unescaped remainder lexbuf =
310 let prefix = Int64.of_int (Char.code (Lexing.lexeme_char lexbuf 0)) in
311 prefix :: remainder lexbuf
313 let lex_comment remainder lexbuf =
314 let ch = Lexing.lexeme_char lexbuf 0 in
315 let prefix = Int64.of_int (Char.code ch) in
316 if ch = '\n' then newline lexbuf;
317 prefix :: remainder lexbuf
319 let make_char (i:int64):char =
320 let min_val = Int64.zero in
321 let max_val = Int64.of_int 255 in
322 (* if i < 0 || i > 255 then error*)
323 if compare i min_val < 0 || compare i max_val > 0 then begin
324 let msg = Printf.sprintf "clexer:make_char: character 0x%Lx too big" i in
327 Char.chr (Int64.to_int i)
330 (* ISO standard locale-specific function to convert a wide character
331 * into a sequence of normal characters. Here we work on strings.
332 * We convert L"Hi" to "H\000i\000"
333 matth: this seems unused.
335 let len = String.length wstr in
336 let dest = String.make (len * 2) '\000' in
337 for i = 0 to len-1 do
338 dest.[i*2] <- wstr.[i] ;
343 (* This function converst the "Hi" in L"Hi" to { L'H', L'i', L'\0' }
344 matth: this seems unused.
345 let wstr_to_warray wstr =
346 let len = String.length wstr in
347 let res = ref "{ " in
348 for i = 0 to len-1 do
349 res := !res ^ (Printf.sprintf "L'%c', " wstr.[i])
357 let decdigit = ['0'-'9']
358 let octdigit = ['0'-'7']
359 let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F']
360 let letter = ['a'- 'z' 'A'-'Z']
363 let usuffix = ['u' 'U']
364 let lsuffix = "l"|"L"|"ll"|"LL"
365 let intsuffix = lsuffix | usuffix | usuffix lsuffix | lsuffix usuffix
369 let hexprefix = '0' ['x' 'X']
371 let intnum = decdigit+ intsuffix?
372 let octnum = '0' octdigit+ intsuffix?
373 let hexnum = hexprefix hexdigit+ intsuffix?
375 let exponent = ['e' 'E']['+' '-']? decdigit+
376 let fraction = '.' decdigit+
377 let decfloat = (intnum? fraction)
379 |(intnum? fraction exponent)
381 | (intnum '.' exponent)
383 let hexfraction = hexdigit* '.' hexdigit+ | hexdigit+
384 let binexponent = ['p' 'P'] ['+' '-']? decdigit+
385 let hexfloat = hexprefix hexfraction binexponent
386 | hexprefix hexdigit+ binexponent
388 let floatsuffix = ['f' 'F' 'l' 'L']
389 let floatnum = (decfloat | hexfloat) floatsuffix?
391 let ident = (letter|'_'|'$')(letter|decdigit|'_'|'$')*
392 let blank = [' ' '\t' '\012' '\r']+
394 let hex_escape = '\\' ['x' 'X'] hexdigit+
395 let oct_escape = '\\' octdigit octdigit? octdigit?
398 parse "/*" { comment lexbuf;
400 | "//" { onelinecomment lexbuf;
404 | blank { initial lexbuf}
405 | '\n' { newline lexbuf;
407 | '\\' '\r' * '\n' { newline lexbuf;
412 | "_Pragma" { PRAGMA (currentLoc lexbuf) }
414 | '\'' { CST_CHAR (chr lexbuf, currentLoc lexbuf)}
415 | "L'" { CST_WCHAR (chr lexbuf, currentLoc lexbuf) }
417 (* matth: BUG: this could be either a regular string or a wide string.
418 * e.g. if it's the "world" in
420 * then it should be treated as wide even though there's no L immediately
421 * preceding it. See test/small1/wchar5.c for a failure case. *)
422 CST_STRING (str lexbuf, currentLoc lexbuf) }
423 | "L\"" { (* weimer: wchar_t string literal *)
424 CST_WSTRING(str lexbuf, currentLoc lexbuf) }
425 | floatnum {CST_FLOAT (Lexing.lexeme lexbuf, currentLoc lexbuf)}
426 | hexnum {CST_INT (Lexing.lexeme lexbuf, currentLoc lexbuf)}
427 | octnum {CST_INT (Lexing.lexeme lexbuf, currentLoc lexbuf)}
428 | intnum {CST_INT (Lexing.lexeme lexbuf, currentLoc lexbuf)}
450 | "++" {PLUS_PLUS (currentLoc lexbuf)}
451 | "--" {MINUS_MINUS (currentLoc lexbuf)}
453 | '+' {PLUS (currentLoc lexbuf)}
454 | '-' {MINUS (currentLoc lexbuf)}
455 | '*' {STAR (currentLoc lexbuf)}
458 | '!' {EXCLAM (currentLoc lexbuf)}
459 | "&&" {AND_AND (currentLoc lexbuf)}
461 | '&' {AND (currentLoc lexbuf)}
466 | '~' {TILDE (currentLoc lexbuf)}
468 | '{' {LBRACE (currentLoc lexbuf)}
469 | '}' {RBRACE (currentLoc lexbuf)}
472 | '(' { (LPAREN (currentLoc lexbuf)) }
474 | ';' { (SEMICOLON (currentLoc lexbuf)) }
477 | "sizeof" {SIZEOF (currentLoc lexbuf)}
478 | "__asm" { if !msvcMode then
479 MSASM (msasm lexbuf, currentLoc lexbuf)
480 else (ASM (currentLoc lexbuf)) }
482 (* If we see __pragma we eat it and the matching parentheses as well *)
483 | "__pragma" { matchingParsOpen := 0;
484 let _ = matchingpars lexbuf in
488 (* __extension__ is a black. The parser runs into some conflicts if we let it
490 | "__extension__" {initial lexbuf }
491 | ident {scan_ident lexbuf (Lexing.lexeme lexbuf)}
493 | _ {parse_error "Invalid symbol"; raise Parsing.Parse_error }
498 | '\n' { newline lexbuf; comment lexbuf }
499 | _ { comment lexbuf }
502 and onelinecomment = parse
504 | _ { onelinecomment lexbuf }
506 and matchingpars = parse
507 '\n' { newline lexbuf; matchingpars lexbuf }
508 | blank { matchingpars lexbuf }
509 | '(' { incr matchingParsOpen; matchingpars lexbuf }
510 | ')' { decr matchingParsOpen;
511 if !matchingParsOpen = 0 then
516 | "/*" { comment lexbuf; matchingpars lexbuf}
518 let _ = str lexbuf in
521 | _ { matchingpars lexbuf }
523 (* # <line number> <file name> ... *)
525 '\n' { newline lexbuf; initial lexbuf}
526 | blank { hash lexbuf}
527 | intnum { (* We are seeing a line number. This is the number for the
529 let s = Lexing.lexeme lexbuf in
531 setCurrentLine lexbuf (int_of_string s - 1)
532 with Failure ("int_of_string") ->
533 (* the int is too big. *)
536 (* A file name may follow *)
538 | "line" { hash lexbuf } (* MSVC line number info *)
540 { let here = currentLoc lexbuf in
541 PRAGMA_LINE (pragma lexbuf, here)
543 | _ { endline lexbuf}
546 '\n' { newline lexbuf; initial lexbuf}
547 | blank { file lexbuf}
548 | '"' [^ '\012' '\t' '"']* '"' { (* '"' *)
549 let n = Lexing.lexeme lexbuf in
550 let n1 = String.sub n 1
551 ((String.length n) - 2) in
552 setCurrentFile lexbuf n1;
555 | _ { endline lexbuf}
558 '\n' { newline lexbuf; initial lexbuf}
560 | _ { endline lexbuf}
563 '\n' { newline lexbuf; "" }
564 | _ { let cur = Lexing.lexeme lexbuf in
565 cur ^ (pragma lexbuf) }
568 '"' {[]} (* no nul terminiation in CST_STRING '"' *)
569 | hex_escape { lex_hex_escape str lexbuf}
570 | oct_escape { lex_oct_escape str lexbuf}
571 | escape { lex_simple_escape str lexbuf}
572 | _ { lex_unescaped str lexbuf}
576 | hex_escape {lex_hex_escape chr lexbuf}
577 | oct_escape {lex_oct_escape chr lexbuf}
578 | escape {lex_simple_escape chr lexbuf}
579 | _ {lex_unescaped chr lexbuf}
582 blank { msasm lexbuf }
583 | '{' { msasminbrace lexbuf }
584 | _ { let cur = Lexing.lexeme lexbuf in
585 cur ^ (msasmnobrace lexbuf) }
587 and msasminbrace = parse
589 | _ { let cur = Lexing.lexeme lexbuf in
590 cur ^ (msasminbrace lexbuf) }
591 and msasmnobrace = parse
592 ['}' ';' '\n'] { lexbuf.Lexing.lex_curr_pos <-
593 lexbuf.Lexing.lex_curr_pos - 1;
595 | "__asm" { lexbuf.Lexing.lex_curr_pos <-
596 lexbuf.Lexing.lex_curr_pos - 5;
598 | _ { let cur = Lexing.lexeme lexbuf in
600 cur ^ (msasmnobrace lexbuf) }