2 * ----------------------------------------------------------------------
11 * The rules are translated to:
13 * let rec parse_<rule1> ... = ...
14 * and parse_<rule2> ... = ...
16 * and parse_<ruleN> ... = ...
19 * Every rule has at least two arguments: 'current' and 'get_next'.
20 * 'current()' is the token that should match the first symbol of the
21 * rule. 'get_next()' returns the next token.
23 * The rules may have further user arguments; these are the next arguments
26 * The rules return the user value. After they have returned to the caller
27 * the current token is the token that follows the sequence of tokens
30 * The rules will raise:
31 * - Not_found if the first token does not match
32 * - Parsing.Parse_error if the rest does not match.
36 * rule(arg1,arg2,...):
39 * (l2:y2(name1,...)) y3 ...
46 * let parse_<rule> current get_next arg1 arg2 ... =
47 * match current() with
52 * | _ -> raise Not_found
54 * Here, S(xi) denotes the set of tokens matched by xi without all tokens
55 * already matched by x1 to x(i-1). (If S(xi) = empty, a warning is printed,
56 * and this branch of the rule is omitted.)
58 * S(xi) may be a set because xi may be a reference to another rule. In this
59 * case, S(xi) bases on the set of tokens that match the first symbol of
60 * the other rule. (In general, S(xi) must be computed recursively.)
62 * If the "?" clause is present, every branch is embraced by the following:
64 * let position = ref "<Label of x1>" in
66 * with Parsing.Parse_error -> ( <<?-CODE>> )
71 * OPTIONAL: let <l1> = parse_<rule(x1)> in
79 * If x1 is a rule invocation, it is now parsed, and the result is bound
82 * Note: After x1 has matched, the Caml variable <l1> must be either
83 * bound to the result of the sub parsing, or to the value associated
84 * with the token (if any). The latter is already done in the main
85 * "match" statement, i.e. "match ... with S(x1) -> ..." is actually
86 * "match ... with Token1 <l1> -> ...".
88 * Note: After calling parse_<rule(x1)> the exception Not_found is NEVER
89 * converted to Parsing.Parse_error. It is simply not possible that this
92 * For every remaining symbol yi of the rule, a matching statement M(yi)
93 * is produced. These statements have the form:
95 * OPTIONAL: position := "<Label of yi>";
96 * CASE: yi is a token without associated value
97 * let yy_i = get_next() OR current() in
98 * if yy_i <> Token(yi) then raise Parsing.Parse_error;
99 * CASE: yi is a token with value
100 * let yy_i = get_next() OR current() in
101 * let <li> = match yy_i with Token x -> x | _ -> raise Parsing.Parse_error
103 * CASE: yi is a rule invocation
104 * OPTIONAL: let _ = get_next() in
105 * let <li> = try parse_<rule(yi)>
106 * with Not_found -> raise Parsing.Parse_error in
108 * yy_i is get_next() if y(i-1) was a token, and yy_i is current() if
109 * y(i-1) was a rule invocation.
115 * CASE no label given:
119 * M(yi') with the modification that top-level mismatches raise
120 * Not_found instead of Parsing.Parse_error
122 * with Not_found -> ()
125 * CASE a label <li> is given: The list of results must be bound to <li>!
127 * let yy_list = ref [] in
130 * let yy_first = M(yi') (with some modifications) in
131 * yy_list := yy_first :: !yy_list;
133 * with Not_found -> ()
135 * let <li> = List.rev !yy_list in
137 * Note that this scheme minimizes stack and heap allocations.
143 * CASE no label given:
146 * M(yi') with the modification that top-level mismatches raise
147 * Not_found instead of Parsing.Parse_error
148 * with Not_found -> ()
151 * CASE a label <li> is given: The optional result must be bound to <li>!
155 * Some( M(yi') (with some modifications) )
156 * with Not_found -> None
161 let lookup_rule tree name =
163 List.find (fun r -> r.rule_name = name) tree.text_rules
166 failwith ("Rule `" ^ name ^ "' not found")
170 let is_typed tree name =
171 (* Find out whether the token 'name' is typed or not *)
174 List.find (fun d -> match d with
175 D_token n -> n = name
176 | D_typed_token n -> n = name
181 failwith ("Token `" ^ name ^ "' not found")
185 | D_typed_token _ -> true
189 let label_of_symbol tree sym =
191 U_symbol (tok, lab) ->
192 (* if is_typed tree tok then lab else None *)
194 | L_symbol (_, _, lab) -> lab
195 | L_indirect (_, _, lab) -> lab
199 let is_untyped_U_symbol tree sym =
202 not(is_typed tree tok)
203 | L_symbol (_, _, _) -> false
204 | L_indirect (_, _, _) -> false
209 let rec set_of_list l =
210 (* Removes duplicate members of l *)
213 | x :: l' -> if List.mem x l' then set_of_list l' else x :: (set_of_list l')
217 let selector_set_of_rule tree name =
218 (* Determines the set of tokens that match the first symbol of a rule *)
220 let rec collect visited_rules name =
221 if List.mem name visited_rules then
224 let r = lookup_rule tree name in
228 match branch.branch_selector with
229 U_symbol (tok_name,_) ->
231 | L_symbol (rule_name, _, _) ->
232 collect (name :: visited_rules) rule_name
233 | L_indirect (_, _, _) ->
234 failwith("The first symbol in rule `" ^ name ^
235 "' is an indirect call; this is not allowed")
240 set_of_list (collect [] name)
244 let output_code_location b file_name (_, line, column) =
245 Buffer.add_string b "\n";
246 Buffer.add_string b ("# " ^ string_of_int line ^ " \"" ^
248 Buffer.add_string b (String.make column ' ')
252 let phantasy_line = ref 100000;;
254 let output_code b file_name ((code, line, column) as triple) =
255 if code <> "" then begin
256 output_code_location b file_name triple;
257 Buffer.add_string b code;
258 Buffer.add_string b ("\n# " ^ string_of_int !phantasy_line ^ " \"<Generated Code>\"\n");
259 phantasy_line := !phantasy_line + 10000;
264 let process_branch b file_name tree branch =
266 let make_rule_invocation called_rule args lab allow_not_found =
267 (* Produces: let <label> = parse_<called_rule> ... args in
268 * If not allow_not_found, the exception Not_found is caught and
269 * changed into Parsing.Parse_error.
271 let r = lookup_rule tree called_rule in
272 if List.length r.rule_arguments <> List.length args then
273 failwith("Calling rule `" ^ called_rule ^ "' with the wrong number of arguments!");
275 Buffer.add_string b "let ";
277 None -> Buffer.add_string b "_"
278 | Some l -> Buffer.add_string b l
280 Buffer.add_string b " = ";
281 if not allow_not_found then
282 Buffer.add_string b "try ";
283 Buffer.add_string b "parse_";
284 Buffer.add_string b called_rule;
285 Buffer.add_string b " yy_current yy_get_next";
287 (fun a -> Buffer.add_string b " ";
288 Buffer.add_string b a;
291 if not allow_not_found then
292 Buffer.add_string b " with Not_found -> raise Parsing.Parse_error";
293 Buffer.add_string b " in\n"
296 let make_indirect_rule_invocation ml_name args lab allow_not_found =
297 (* Produces: let <label> = ml_name ... args in
298 * If not allow_not_found, the exception Not_found is caught and
299 * changed into Parsing.Parse_error.
301 Buffer.add_string b "let ";
303 None -> Buffer.add_string b "_"
304 | Some l -> Buffer.add_string b l
306 Buffer.add_string b " = ";
307 if not allow_not_found then
308 Buffer.add_string b "try ";
309 Buffer.add_string b ml_name;
310 Buffer.add_string b " yy_current yy_get_next";
312 (fun a -> Buffer.add_string b " ";
313 Buffer.add_string b a;
316 if not allow_not_found then
317 Buffer.add_string b " with Not_found -> raise Parsing.Parse_error";
318 Buffer.add_string b " in\n"
321 let process_symbol sym previous_was_token allow_not_found =
323 U_symbol(tok, lab) ->
324 (* Distinguish between simple tokens and typed tokens *)
325 if is_typed tree tok then begin
327 Buffer.add_string b "let ";
329 None -> Buffer.add_string b "_"
330 | Some l -> Buffer.add_string b l
332 Buffer.add_string b " = match ";
333 if previous_was_token then
334 Buffer.add_string b "yy_get_next()"
336 Buffer.add_string b "yy_current()";
337 Buffer.add_string b " with ";
338 Buffer.add_string b tok;
339 Buffer.add_string b " x -> x | _ -> raise ";
340 if allow_not_found then
341 Buffer.add_string b "Not_found"
343 Buffer.add_string b "Parsing.Parse_error";
344 Buffer.add_string b " in\n";
348 Buffer.add_string b "if (";
349 if previous_was_token then
350 Buffer.add_string b "yy_get_next()"
352 Buffer.add_string b "yy_current()";
353 Buffer.add_string b ") <> ";
354 Buffer.add_string b tok;
355 Buffer.add_string b " then raise ";
356 if allow_not_found then
357 Buffer.add_string b "Not_found;\n"
359 Buffer.add_string b "Parsing.Parse_error;\n"
361 | L_symbol(called_rule, args, lab) ->
362 if previous_was_token then
363 Buffer.add_string b "ignore(yy_get_next());\n";
364 make_rule_invocation called_rule args lab allow_not_found
365 | L_indirect(ml_name, args, lab) ->
366 if previous_was_token then
367 Buffer.add_string b "ignore(yy_get_next());\n";
368 make_indirect_rule_invocation ml_name args lab allow_not_found
371 let process_pattern (current_position, previous_was_token) pat =
372 (* Assign "position" if necessary. *)
374 if branch.branch_error_code <> None then begin
375 match pat.pat_symbol with
376 U_symbol(_,Some l) -> l
377 | L_symbol(_,_,Some l) -> l
378 | L_indirect(_,_,Some l) -> l
383 if new_position <> current_position then begin
384 Buffer.add_string b "yy_position := \"";
385 Buffer.add_string b new_position;
386 Buffer.add_string b "\";\n";
390 match pat.pat_symbol with
391 U_symbol(_,_) -> pat.pat_modifier = Exact
392 | L_symbol(_,_,_) -> false
393 | L_indirect(_,_,_) -> false
396 (* First distinguish between Exact, Option, and Repetition: *)
397 begin match pat.pat_modifier with
399 process_symbol pat.pat_symbol previous_was_token false
401 begin match label_of_symbol tree pat.pat_symbol with
403 (* CASE: optional symbol without label *)
404 (* OPTIMIZATION: If the symbol is
405 * a token, the loop becomes very simple.
407 if (match pat.pat_symbol with
408 U_symbol(t,_) -> not (is_typed tree t) | _ -> false)
410 let tok = match pat.pat_symbol with
411 U_symbol(t,_) -> t | _ -> assert false in
413 Buffer.add_string b "if ";
414 if previous_was_token then
415 Buffer.add_string b "yy_get_next()"
417 Buffer.add_string b "yy_current()";
418 Buffer.add_string b " = ";
419 Buffer.add_string b tok;
420 Buffer.add_string b " then ignore(yy_get_next());\n";
423 (* General, non-optimized case: *)
424 Buffer.add_string b "( try (";
425 process_symbol pat.pat_symbol previous_was_token true;
426 Buffer.add_string b "ignore(yy_get_next());\n";
427 Buffer.add_string b ") with Not_found -> ());\n";
430 (* CASE: optional symbol with label *)
431 if is_untyped_U_symbol tree pat.pat_symbol then begin
432 (* SUBCASE: The label becomes a boolean variable *)
433 Buffer.add_string b "let ";
434 Buffer.add_string b l;
435 Buffer.add_string b " = try (";
436 process_symbol pat.pat_symbol previous_was_token true;
437 Buffer.add_string b ");\n";
438 Buffer.add_string b "ignore(yy_get_next());\n";
439 Buffer.add_string b "true with Not_found -> false in\n";
442 (* SUBCASE: the symbol has a value *)
443 Buffer.add_string b "let ";
444 Buffer.add_string b l;
445 Buffer.add_string b " = try let yy_tok = Some(";
446 process_symbol pat.pat_symbol previous_was_token true;
447 Buffer.add_string b l;
448 Buffer.add_string b ") in\n";
450 if (match pat.pat_symbol with
451 U_symbol(_,_) -> true | _ -> false) then
452 Buffer.add_string b "ignore(yy_get_next());\n";
454 Buffer.add_string b "yy_tok with Not_found -> None in\n";
458 begin match label_of_symbol tree pat.pat_symbol with
460 (* CASE: repeated symbol without label *)
461 (* OPTIMIZATION: If the symbol is
462 * a token, the loop becomes very simple.
464 if (match pat.pat_symbol with
465 U_symbol(t,_) -> not (is_typed tree t) | _ -> false)
467 let tok = match pat.pat_symbol with
468 U_symbol(t,_) -> t | _ -> assert false in
469 if previous_was_token then begin
470 (* Optimized case I *)
471 Buffer.add_string b "while yy_get_next() = ";
472 Buffer.add_string b tok;
473 Buffer.add_string b " do () done;\n";
476 (* Optimized case II *)
477 Buffer.add_string b "if yy_current() = ";
478 Buffer.add_string b tok;
479 Buffer.add_string b " then (";
480 Buffer.add_string b "while yy_get_next() = ";
481 Buffer.add_string b tok;
482 Buffer.add_string b " do () done);\n";
486 (* General, non-optimized case: *)
487 if previous_was_token then
488 Buffer.add_string b "ignore(yy_get_next());\n";
489 Buffer.add_string b "( try while true do (";
490 process_symbol pat.pat_symbol false true;
492 if (match pat.pat_symbol with
493 U_symbol(_,_) -> true | _ -> false) then
494 Buffer.add_string b "ignore(yy_get_next());\n"
496 Buffer.add_string b "();\n";
498 Buffer.add_string b ") done with Not_found -> ());\n";
501 (* CASE: repeated symbol with label *)
502 if is_untyped_U_symbol tree pat.pat_symbol then begin
503 (* SUBCASE: The label becomes an integer variable *)
504 if previous_was_token then
505 Buffer.add_string b "ignore(yy_get_next());\n";
506 Buffer.add_string b "let yy_counter = ref 0 in\n";
507 Buffer.add_string b "( try while true do \n";
508 process_symbol pat.pat_symbol false true;
509 Buffer.add_string b "incr yy_counter;\n";
511 if (match pat.pat_symbol with
512 U_symbol(_,_) -> true | _ -> false) then
513 Buffer.add_string b "ignore(yy_get_next());\n";
515 Buffer.add_string b "done with Not_found -> ());\n";
516 Buffer.add_string b "let ";
517 Buffer.add_string b l;
518 Buffer.add_string b " = !yy_counter in\n";
521 (* SUBCASE: the symbol has a value *)
522 if previous_was_token then
523 Buffer.add_string b "ignore(yy_get_next());\n";
524 Buffer.add_string b "let yy_list = ref [] in\n";
525 Buffer.add_string b "( try while true do \n";
526 process_symbol pat.pat_symbol false true;
527 Buffer.add_string b "yy_list := ";
528 Buffer.add_string b l;
529 Buffer.add_string b " :: !yy_list;\n";
531 if (match pat.pat_symbol with
532 U_symbol(_,_) -> true | _ -> false) then
533 Buffer.add_string b "ignore(yy_get_next());\n";
535 Buffer.add_string b "done with Not_found -> ());\n";
536 Buffer.add_string b "let ";
537 Buffer.add_string b l;
538 Buffer.add_string b " = List.rev !yy_list in\n";
544 (new_position, this_is_token)
548 let process_inner_branch current_position =
549 (* If there is "early code", run this now: *)
550 output_code b file_name branch.branch_early_code;
551 Buffer.add_string b "\n";
553 (* If the first symbol is a rule invocation, call the corresponding
554 * parser function now.
556 let previous_was_token =
557 begin match branch.branch_selector with
560 | L_symbol(called_rule, args, lab) ->
561 make_rule_invocation called_rule args lab true;
563 | L_indirect(_,_,_) ->
564 failwith("The first symbol in some rule is an indirect call; this is not allowed")
568 (* Now output the "let-CODE". *)
569 output_code b file_name branch.branch_binding_code;
570 Buffer.add_string b "\n";
572 (* Process the other symbols in turn: *)
573 let (_, previous_was_token') =
576 (current_position, previous_was_token)
577 branch.branch_pattern
583 * If previous_was_token', we must invoke yy_get_next one more time.
584 * This is deferred until "CODE" is executed to give this code
585 * the chance to make the next token available (in XML, the next token
586 * might come from a different entity, and "CODE" must switch to this
590 (* Now output "CODE": *)
591 Buffer.add_string b "let result = \n";
592 output_code b file_name branch.branch_result_code;
593 Buffer.add_string b "\nin\n";
595 if previous_was_token' then
596 Buffer.add_string b "ignore(yy_get_next());\nresult\n"
598 Buffer.add_string b "result\n"
601 (* If we have a ? clause, generate now the "try" statement *)
602 match branch.branch_error_code with
604 Buffer.add_string b "( ";
605 process_inner_branch "";
606 Buffer.add_string b " )";
609 (* let position = ref "<label>" in *)
611 Buffer.add_string b "let yy_position = ref \"";
612 let current_position =
613 match branch.branch_selector with
615 | L_symbol(_,_,None) -> ""
616 | L_symbol(_,_,Some l) -> l
617 | L_indirect(_,_,None) -> ""
618 | L_indirect(_,_,Some l) -> l
620 Buffer.add_string b current_position;
621 Buffer.add_string b "\" in\n";
623 (* The "try" statement: *)
625 Buffer.add_string b "( try (\n";
627 process_inner_branch current_position;
629 Buffer.add_string b "\n) with Parsing.Parse_error -> (\n";
630 output_code b file_name code;
631 Buffer.add_string b "\n))\n"
635 let process b file_name tree =
636 (* Iterate over the rules and output the parser functions: *)
637 let is_first = ref true in
641 (* Generate the function header: *)
644 Buffer.add_string b "let rec "
646 Buffer.add_string b "and ";
648 Buffer.add_string b "parse_";
649 Buffer.add_string b r.rule_name;
650 Buffer.add_string b " yy_current yy_get_next";
652 (fun arg -> Buffer.add_string b " ";
653 Buffer.add_string b arg)
655 Buffer.add_string b " =\n";
657 (* Generate the "match" statement: *)
659 Buffer.add_string b "match yy_current() with\n";
660 let s_done = ref [] in
661 (* s_done: The set of already matched tokens *)
665 match branch.branch_selector with
666 U_symbol(tok, lab) ->
668 if List.mem tok !s_done then begin
669 prerr_endline("WARNING: In rule `" ^ r.rule_name ^
670 "': Match for token `" ^
671 tok ^ "' hidden by previous match");
674 if is_typed tree tok then begin
677 Buffer.add_string b "| ";
678 Buffer.add_string b tok;
679 Buffer.add_string b " _ -> ";
680 process_branch b file_name tree branch;
681 Buffer.add_string b "\n";
682 s_done := tok :: !s_done;
684 Buffer.add_string b "| ";
685 Buffer.add_string b tok;
686 Buffer.add_string b " ";
687 Buffer.add_string b l;
688 Buffer.add_string b " -> ";
689 process_branch b file_name tree branch;
690 Buffer.add_string b "\n";
691 s_done := tok :: !s_done;
694 Buffer.add_string b "| ";
695 Buffer.add_string b tok;
696 Buffer.add_string b " -> ";
697 process_branch b file_name tree branch;
698 Buffer.add_string b "\n";
699 s_done := tok :: !s_done;
701 | L_symbol(called_rule, args, lab) ->
702 (* An invocation of a rule *)
703 let s_rule = selector_set_of_rule tree called_rule in
707 if List.mem tok !s_done then begin
708 prerr_endline("WARNING: In rule `" ^ r.rule_name ^
709 "': Match for token `" ^
710 tok ^ "' hidden by previous match");
715 if s_rule' <> [] then begin
716 Buffer.add_string b "| ( ";
717 let is_first = ref true in
720 if not !is_first then
721 Buffer.add_string b " | ";
723 Buffer.add_string b tok;
724 if is_typed tree tok then
725 Buffer.add_string b " _";
728 Buffer.add_string b ") -> ";
729 process_branch b file_name tree branch;
730 Buffer.add_string b "\n";
731 s_done := s_rule' @ !s_done;
733 | L_indirect(ml_name, args, lab) ->
734 (* An invocation of an indirect rule *)
735 failwith("The first symbol in rule `" ^ r.rule_name ^
736 "' is an indirect call; this is not allowed")
740 Buffer.add_string b "\n| _ -> raise Not_found\n";
744 Buffer.add_string b " in\n"
749 (* returns number of lines in s, number of columns of the last line *)
750 let l = String.length s in
752 let rec count n k no_cr no_lf =
757 try String.index_from s k '\013' with Not_found -> (-1) in
762 try String.index_from s k '\010' with Not_found -> (-1) in
763 if next_cr >= 0 & (next_lf < 0 or next_cr < next_lf) then begin
764 if next_cr+1 < l & s.[next_cr+1] = '\010' then
765 count (n+1) (next_cr+2) false (next_lf < 0)
767 count (n+1) (next_cr+1) false (next_lf < 0)
769 else if next_lf >= 0 then begin
770 count (n+1) (next_lf+1) (next_cr < 0) false
776 count 0 0 false false
781 { mutable old_line : int;
782 mutable old_column : int;
784 mutable column : int;
789 let rec next_token context lexbuf =
790 let t = Lexer.scan_file lexbuf in
791 let line = context.line in
792 let column = context.column in
793 context.old_line <- line;
794 context.old_column <- column;
795 let n_lines, n_columns = count_lines (Lexing.lexeme lexbuf) in
796 if n_lines > 0 then begin
797 context.line <- line + n_lines;
798 context.column <- n_columns;
801 context.column <- column + n_columns;
803 Space -> next_token context lexbuf
804 | Code(s,_,_) -> Code(s,line,column + 2)
805 | Eof -> failwith "Unexpected end of file"
810 let parse_and_generate ch =
811 let b = Buffer.create 20000 in
813 let rec find_sep context lexbuf =
814 let t = Lexer.scan_header lexbuf in
815 let line = context.line in
816 let column = context.column in
817 context.old_line <- line;
818 context.old_column <- column;
819 let n_lines, n_columns = count_lines (Lexing.lexeme lexbuf) in
820 if n_lines > 0 then begin
821 context.line <- line + n_lines;
822 context.column <- n_columns;
825 context.column <- column + n_columns;
828 Buffer.add_string b s;
829 find_sep context lexbuf
830 | Eof -> failwith "Unexpected end of file"
835 let rec find_rest context lexbuf =
836 let t = Lexer.scan_header lexbuf in
837 let line = context.line in
838 let column = context.column in
839 context.old_line <- line;
840 context.old_column <- column;
841 let n_lines, n_columns = count_lines (Lexing.lexeme lexbuf) in
842 if n_lines > 0 then begin
843 context.line <- line + n_lines;
844 context.column <- n_columns;
847 context.column <- column + n_columns;
850 Buffer.add_string b s;
851 find_rest context lexbuf
856 (* First read until '%%' *)
857 let lexbuf = Lexing.from_channel ch in
858 let context = { old_line = 0; old_column = 0; line = 1; column = 0 } in
859 let file_name = "stdin" in
861 output_code_location b file_name ("", 1, 0);
862 find_sep context lexbuf;
863 (* Parse the following text *)
864 let text = (Parser.text (next_token context) lexbuf : Ast.text) in
866 process b file_name text;
868 output_code_location b file_name ("", context.line, context.column);
869 find_rest context lexbuf;
870 (* Output everything: *)
871 print_string (Buffer.contents b)
875 "Error at line %d column %d: %s\n"
878 (Printexc.to_string any);
883 parse_and_generate stdin;;
886 (* ======================================================================
890 * Revision 1.1 2000/11/17 09:57:32 lpadovan
893 * Revision 1.7 2000/08/17 00:33:02 gerd
894 * Bugfix: tok* and tok? work now if tok is an untyped token
897 * Revision 1.6 2000/05/14 20:59:24 gerd
898 * Added "phantasy line numbers" to help finding errorneous locations.
900 * Revision 1.5 2000/05/14 20:41:58 gerd
901 * x: Token? means: if Token is detected x=true else x=false.
902 * x: Token* means: x becomes the number of ocurrences of Token.
904 * Revision 1.4 2000/05/09 00:03:22 gerd
905 * Added [ ml_name ] symbols, where ml_name is an arbitrary
908 * Revision 1.3 2000/05/08 22:03:01 gerd
909 * It is now possible to have a $ {{ }} sequence right BEFORE
910 * the first token. This code is executed just after the first token
911 * has been recognized.
913 * Revision 1.2 2000/05/06 21:51:08 gerd
916 * Revision 1.1 2000/05/06 17:36:17 gerd