From 71adb7c2f7f84e6bfe523cf066a65cc14cc9217b Mon Sep 17 00:00:00 2001 From: Stefano Zacchiroli Date: Wed, 21 Sep 2005 14:39:30 +0000 Subject: [PATCH] added ligatures support --- helm/ocaml/cic_notation/cicNotation.ml | 4 +- helm/ocaml/cic_notation/cicNotationLexer.ml | 79 ++++++++++++++++--- helm/ocaml/cic_notation/cicNotationLexer.mli | 8 ++ helm/ocaml/cic_notation/cicNotationParser.ml | 31 +++++--- helm/ocaml/cic_notation/cicNotationParser.mli | 10 ++- helm/ocaml/cic_notation/grafiteParser.ml | 20 +++-- helm/ocaml/cic_notation/grafiteParser.mli | 4 +- helm/ocaml/cic_notation/test_dep.ml | 5 +- helm/ocaml/cic_notation/test_lexer.ml | 4 +- helm/ocaml/cic_notation/test_parser.ml | 2 +- 10 files changed, 124 insertions(+), 43 deletions(-) diff --git a/helm/ocaml/cic_notation/cicNotation.ml b/helm/ocaml/cic_notation/cicNotation.ml index f6ea55a48..9ed0c5038 100644 --- a/helm/ocaml/cic_notation/cicNotation.ml +++ b/helm/ocaml/cic_notation/cicNotation.ml @@ -61,10 +61,10 @@ let remove_notation = function let load_notation fname = let ic = open_in fname in - let istream = Stream.of_channel ic in + let lexbuf = Ulexing.from_utf8_channel ic in try while true do - match GrafiteParser.parse_statement istream with + match GrafiteParser.parse_statement lexbuf with | Executable (_, Command (_, cmd)) -> ignore (process_notation cmd) | _ -> () done diff --git a/helm/ocaml/cic_notation/cicNotationLexer.ml b/helm/ocaml/cic_notation/cicNotationLexer.ml index 9c0efc8bb..db0eb3de8 100644 --- a/helm/ocaml/cic_notation/cicNotationLexer.ml +++ b/helm/ocaml/cic_notation/cicNotationLexer.ml @@ -34,6 +34,9 @@ let regexp number = xml_digit+ let regexp ident_letter = [ 'a' - 'z' 'A' - 'Z' ] +let regexp ligature_char = [ "'`~!?@*()[]<>-+=|:;.,/\"" ] +let regexp ligature = ligature_char ligature_char+ + let regexp ident_decoration = '\'' | '!' | '?' | '`' let regexp ident_cont = ident_letter | xml_digit | '_' let regexp ident = ident_letter ident_cont* ident_decoration* @@ -98,6 +101,17 @@ let _ = let add_level2_ast_keyword k = Hashtbl.add level2_ast_keywords k () let remove_level2_ast_keyword k = Hashtbl.remove level2_ast_keywords k + (* (string, int) Hashtbl.t, with multiple bindings. + * int is the unicode codepoint *) +let ligatures = Hashtbl.create 23 +let _ = + List.iter + (fun (ligature, symbol) -> Hashtbl.add ligatures ligature symbol) + [ ("->", <:unicode>); ("=>", <:unicode>); + ("<=", <:unicode>); (">=", <:unicode>); + ("<>", <:unicode>); (":=", <:unicode>); + ] + let regexp uri = ("cic:/" | "theory:/") (* schema *) ident ('/' ident)* (* path *) @@ -127,11 +141,26 @@ let return lexbuf token = let return_lexeme lexbuf name = return lexbuf (name, Ulexing.utf8_lexeme lexbuf) +let return_symbol lexbuf s = return lexbuf ("SYMBOL", s) +let return_eoi lexbuf = return lexbuf ("EOI", "") + let remove_quotes s = String.sub s 1 (String.length s - 2) -let mk_lexer token = +let mk_lexer ?(use_ligatures = false) token = let tok_func stream = - let lexbuf = Ulexing.from_utf8_stream stream in +(* let lexbuf = Ulexing.from_utf8_stream stream in *) +(** XXX Obj.magic rationale. + * The problem. + * camlp4 constraints the tok_func field of Token.glexer to have type: + * Stream.t char -> (Stream.t 'te * flocation_function) + * In order to use ulex we have (in theory) to instantiate a new lexbuf each + * time a char Stream.t is passed, destroying the previous lexbuf which may + * have consumed a character from the old stream which is lost forever :-( + * The "solution". + * Instead of passing to camlp4 a char Stream.t we pass a lexbuf, casting it to + * char Stream.t with Obj.magic where needed. + *) + let lexbuf = Obj.magic stream in Token.make_stream_and_flocation (fun () -> try @@ -161,7 +190,8 @@ let expand_macro lexbuf = let remove_quotes s = String.sub s 1 (String.length s - 2) let remove_left_quote s = String.sub s 1 (String.length s - 1) -let rec level2_pattern_token_group counter buffer = lexer +let rec level2_pattern_token_group counter buffer = + lexer | end_group -> if (counter > 0) then Buffer.add_string buffer (Ulexing.utf8_lexeme lexbuf) ; @@ -180,7 +210,8 @@ let read_unparsed_group token_name lexbuf = let end_cnum = level2_pattern_token_group 0 buffer lexbuf in return_with_loc (token_name, Buffer.contents buffer) begin_cnum end_cnum -let rec level2_meta_token = lexer +let rec level2_meta_token = + lexer | xml_blank+ -> level2_meta_token lexbuf | ident -> let s = Ulexing.utf8_lexeme lexbuf in @@ -197,10 +228,28 @@ let rec level2_meta_token = lexer | ast_csymbol -> return lexbuf ("UNPARSED_AST", remove_left_quote (Ulexing.utf8_lexeme lexbuf)) - | eof -> return lexbuf ("EOI", "") + | eof -> return_eoi lexbuf -let rec level2_ast_token = lexer - | xml_blank+ -> level2_ast_token lexbuf + (** @param k continuation to be invoked when no ligature has been found *) +let rec ligatures_token k = + lexer + | ligature -> + let lexeme = Ulexing.utf8_lexeme lexbuf in + (match Hashtbl.find_all ligatures lexeme with + | [] -> (* ligature not found, rollback and try default lexer *) + Ulexing.rollback lexbuf; + k lexbuf + | ligs -> (* ligatures found, use the default one *) + let default_lig = List.hd (List.rev ligs) in + return_symbol lexbuf default_lig) + | eof -> return_eoi lexbuf + | _ -> (* not a ligature, rollback and try default lexer *) + Ulexing.rollback lexbuf; + k lexbuf + +and level2_ast_token = + lexer + | xml_blank+ -> ligatures_token level2_ast_token lexbuf | meta -> return lexbuf ("META", Ulexing.utf8_lexeme lexbuf) | implicit -> return lexbuf ("IMPLICIT", "") | placeholder -> return lexbuf ("PLACEHOLDER", "") @@ -232,11 +281,12 @@ let rec level2_ast_token = lexer return lexbuf ("NOTE", comment) | begincomment -> return lexbuf ("BEGINCOMMENT","") | endcomment -> return lexbuf ("ENDCOMMENT","") - | eof -> return lexbuf ("EOI", "") - | _ -> return lexbuf ("SYMBOL", Ulexing.utf8_lexeme lexbuf) + | eof -> return_eoi lexbuf + | _ -> return_symbol lexbuf (Ulexing.utf8_lexeme lexbuf) -let rec level1_pattern_token = lexer - | xml_blank+ -> level1_pattern_token lexbuf +and level1_pattern_token = + lexer + | xml_blank+ -> ligatures_token level1_pattern_token lexbuf | number -> return lexbuf ("NUMBER", Ulexing.utf8_lexeme lexbuf) | ident -> let s = Ulexing.utf8_lexeme lexbuf in @@ -251,8 +301,11 @@ let rec level1_pattern_token = lexer return lexbuf ("QKEYWORD", remove_quotes (Ulexing.utf8_lexeme lexbuf)) | '(' -> return lexbuf ("LPAREN", "") | ')' -> return lexbuf ("RPAREN", "") - | eof -> return lexbuf ("EOI", "") - | _ -> return lexbuf ("SYMBOL", Ulexing.utf8_lexeme lexbuf) + | eof -> return_eoi lexbuf + | _ -> return_symbol lexbuf (Ulexing.utf8_lexeme lexbuf) + +let level1_pattern_token = ligatures_token level1_pattern_token +let level2_ast_token = ligatures_token level2_ast_token (* API implementation *) diff --git a/helm/ocaml/cic_notation/cicNotationLexer.mli b/helm/ocaml/cic_notation/cicNotationLexer.mli index 5eb22a99c..765a554f4 100644 --- a/helm/ocaml/cic_notation/cicNotationLexer.mli +++ b/helm/ocaml/cic_notation/cicNotationLexer.mli @@ -28,10 +28,18 @@ * error message *) exception Error of int * int * string + (** XXX ZACK DEFCON 4 BEGIN: never use the tok_func field of the glexers below + * passing values of type char Stream.t, they should be in fact Ulexing.lexbuf + * casted with Obj.magic :-/ Read the comment in the .ml for the rationale *) + val level1_pattern_lexer: (string * string) Token.glexer val level2_ast_lexer: (string * string) Token.glexer val level2_meta_lexer: (string * string) Token.glexer + (** XXX ZACK DEFCON 4 END *) + val add_level2_ast_keyword: string -> unit (** non idempotent *) val remove_level2_ast_keyword: string -> unit (** non idempotent *) +(* val lookup_ligatures: string -> string list *) + diff --git a/helm/ocaml/cic_notation/cicNotationParser.ml b/helm/ocaml/cic_notation/cicNotationParser.ml index 45ff3d233..bcfd6c9f6 100644 --- a/helm/ocaml/cic_notation/cicNotationParser.ml +++ b/helm/ocaml/cic_notation/cicNotationParser.ml @@ -78,7 +78,7 @@ let make_action action bindings = function [] -> Gramext.action (fun (loc: Ast.location) -> action vl loc) | NoBinding :: tl -> Gramext.action (fun _ -> aux vl tl) - (* LUCA: DEFCON 4 BEGIN *) + (* LUCA: DEFCON 3 BEGIN *) | Binding (name, Env.TermType) :: tl -> Gramext.action (fun (v:Ast.term) -> @@ -101,7 +101,7 @@ let make_action action bindings = aux ((name, (Env.ListType t, Env.ListValue v)) :: vl) tl) | Env _ :: tl -> Gramext.action (fun (v:CicNotationEnv.t) -> aux (v @ vl) tl) - (* LUCA: DEFCON 4 END *) + (* LUCA: DEFCON 3 END *) in aux [] (List.rev bindings) @@ -411,7 +411,8 @@ EXTEND level2_meta: [ [ magic = l2_magic -> Ast.Magic magic | var = l2_variable -> Ast.Variable var - | blob = UNPARSED_AST -> !parse_level2_ast_ref (Stream.of_string blob) + | blob = UNPARSED_AST -> + !parse_level2_ast_ref (Ulexing.from_utf8_string blob) ] ]; END @@ -470,7 +471,7 @@ EXTEND List.map (fun n -> Ast.Ident (n, None)) names, Some ty | name = IDENT -> [Ast.Ident (name, None)], None | blob = UNPARSED_META -> - let meta = !parse_level2_meta_ref (Stream.of_string blob) in + let meta = !parse_level2_meta_ref (Ulexing.from_utf8_string blob) in match meta with | Ast.Variable (Ast.FreshVar _) -> [meta], None | Ast.Variable (Ast.TermVar "_") -> [Ast.Ident ("_", None)], None @@ -480,7 +481,7 @@ EXTEND single_arg: [ [ name = IDENT -> Ast.Ident (name, None) | blob = UNPARSED_META -> - let meta = !parse_level2_meta_ref (Stream.of_string blob) in + let meta = !parse_level2_meta_ref (Ulexing.from_utf8_string blob) in match meta with | Ast.Variable (Ast.FreshVar _) | Ast.Variable (Ast.IdentVar _) -> meta @@ -601,7 +602,8 @@ EXTEND | LPAREN; p1 = term; SYMBOL ":"; p2 = term; RPAREN -> return_term loc (Ast.Cast (p1, p2)) | LPAREN; p = term; RPAREN -> p - | blob = UNPARSED_META -> !parse_level2_meta_ref (Stream.of_string blob) + | blob = UNPARSED_META -> + !parse_level2_meta_ref (Ulexing.from_utf8_string blob) ] ]; END @@ -618,12 +620,17 @@ let exc_located_wrapper f = | Stdpp.Exc_located (floc, exn) -> raise (Parse_error (floc, (Printexc.to_string exn))) -let parse_level1_pattern stream = - exc_located_wrapper (fun () -> Grammar.Entry.parse level1_pattern stream) -let parse_level2_ast stream = - exc_located_wrapper (fun () -> Grammar.Entry.parse level2_ast stream) -let parse_level2_meta stream = - exc_located_wrapper (fun () -> Grammar.Entry.parse level2_meta stream) +let parse_level1_pattern lexbuf = + exc_located_wrapper + (fun () -> Grammar.Entry.parse level1_pattern (Obj.magic lexbuf)) + +let parse_level2_ast lexbuf = + exc_located_wrapper + (fun () -> Grammar.Entry.parse level2_ast (Obj.magic lexbuf)) + +let parse_level2_meta lexbuf = + exc_located_wrapper + (fun () -> Grammar.Entry.parse level2_meta (Obj.magic lexbuf)) let _ = parse_level1_pattern_ref := parse_level1_pattern; diff --git a/helm/ocaml/cic_notation/cicNotationParser.mli b/helm/ocaml/cic_notation/cicNotationParser.mli index 80d79ef44..d614c68bc 100644 --- a/helm/ocaml/cic_notation/cicNotationParser.mli +++ b/helm/ocaml/cic_notation/cicNotationParser.mli @@ -29,11 +29,11 @@ exception Level_not_found of int (** {2 Parsing functions} *) (** concrete syntax pattern: notation level 1 *) -val parse_level1_pattern: char Stream.t -> CicNotationPt.term +val parse_level1_pattern: Ulexing.lexbuf -> CicNotationPt.term (** AST pattern: notation level 2 *) -val parse_level2_ast: char Stream.t -> CicNotationPt.term -val parse_level2_meta: char Stream.t -> CicNotationPt.term +val parse_level2_ast: Ulexing.lexbuf -> CicNotationPt.term +val parse_level2_meta: Ulexing.lexbuf -> CicNotationPt.term (** {2 Grammar extension} *) @@ -54,8 +54,10 @@ val delete: rule_id -> unit val level2_ast_grammar: Grammar.g val term : CicNotationPt.term Grammar.Entry.e + val let_defs : - (CicNotationPt.capture_variable * CicNotationPt.term * int) list Grammar.Entry.e + (CicNotationPt.capture_variable * CicNotationPt.term * int) list + Grammar.Entry.e (** {2 Debugging} *) diff --git a/helm/ocaml/cic_notation/grafiteParser.ml b/helm/ocaml/cic_notation/grafiteParser.ml index 9f07baa4a..5caba868b 100644 --- a/helm/ocaml/cic_notation/grafiteParser.ml +++ b/helm/ocaml/cic_notation/grafiteParser.ml @@ -373,10 +373,12 @@ EXTEND p2 = [ blob = UNPARSED_AST -> add_raw_attribute ~text:(sprintf "@{%s}" blob) - (CicNotationParser.parse_level2_ast (Stream.of_string blob)) + (CicNotationParser.parse_level2_ast + (Ulexing.from_utf8_string blob)) | blob = UNPARSED_META -> add_raw_attribute ~text:(sprintf "${%s}" blob) - (CicNotationParser.parse_level2_meta (Stream.of_string blob)) + (CicNotationParser.parse_level2_meta + (Ulexing.from_utf8_string blob)) ] -> let assoc = match assoc with @@ -390,7 +392,8 @@ EXTEND in let p1 = add_raw_attribute ~text:s - (CicNotationParser.parse_level1_pattern (Stream.of_string s)) + (CicNotationParser.parse_level1_pattern + (Ulexing.from_utf8_string s)) in (dir, p1, assoc, prec, p2) ] @@ -504,11 +507,14 @@ let exc_located_wrapper f = | Stdpp.Exc_located (floc, exn) -> raise (CicNotationParser.Parse_error (floc, (Printexc.to_string exn))) -let parse_statement stream = - exc_located_wrapper (fun () -> (Grammar.Entry.parse statement stream)) +let parse_statement lexbuf = + exc_located_wrapper + (fun () -> (Grammar.Entry.parse statement (Obj.magic lexbuf))) -let parse_dependencies stream = - let tok_stream,_ = CicNotationLexer.level2_ast_lexer.Token.tok_func stream in +let parse_dependencies lexbuf = + let tok_stream,_ = + CicNotationLexer.level2_ast_lexer.Token.tok_func (Obj.magic lexbuf) + in let rec parse acc = (parser | [< '("URI", u) >] -> diff --git a/helm/ocaml/cic_notation/grafiteParser.mli b/helm/ocaml/cic_notation/grafiteParser.mli index e6b549dab..36f1db49b 100644 --- a/helm/ocaml/cic_notation/grafiteParser.mli +++ b/helm/ocaml/cic_notation/grafiteParser.mli @@ -25,11 +25,11 @@ (** @raise End_of_file *) val parse_statement: - char Stream.t -> + Ulexing.lexbuf -> (CicNotationPt.term, CicNotationPt.term, GrafiteAst.reduction, GrafiteAst.obj, string) GrafiteAst.statement (** @raise End_of_file *) -val parse_dependencies: char Stream.t -> GrafiteAst.dependency list +val parse_dependencies: Ulexing.lexbuf -> GrafiteAst.dependency list diff --git a/helm/ocaml/cic_notation/test_dep.ml b/helm/ocaml/cic_notation/test_dep.ml index c40b6e7de..a2c7e392e 100644 --- a/helm/ocaml/cic_notation/test_dep.ml +++ b/helm/ocaml/cic_notation/test_dep.ml @@ -31,5 +31,8 @@ let _ = ic := open_in fname in Arg.parse [] open_file usage; - let deps = GrafiteParser.parse_dependencies (Stream.of_channel !ic) in + let deps = + GrafiteParser.parse_dependencies (Ulexing.from_utf8_channel !ic) + in List.iter (fun dep -> print_endline (GrafiteAstPp.pp_dependency dep)) deps + diff --git a/helm/ocaml/cic_notation/test_lexer.ml b/helm/ocaml/cic_notation/test_lexer.ml index c3569cda4..569e86e44 100644 --- a/helm/ocaml/cic_notation/test_lexer.ml +++ b/helm/ocaml/cic_notation/test_lexer.ml @@ -42,7 +42,9 @@ let _ = prerr_endline (Printf.sprintf "Unsupported level %s" l); exit 2 in - let token_stream = fst (lexer.Token.tok_func (Stream.of_channel !ic)) in + let token_stream = + fst (lexer.Token.tok_func (Obj.magic (Ulexing.from_utf8_channel !ic))) + in Printf.printf "Lexing notation level %s\n" !level; flush stdout; let rec dump () = let (a,b) = Stream.next token_stream in diff --git a/helm/ocaml/cic_notation/test_parser.ml b/helm/ocaml/cic_notation/test_parser.ml index 1ad6b924a..b3685232e 100644 --- a/helm/ocaml/cic_notation/test_parser.ml +++ b/helm/ocaml/cic_notation/test_parser.ml @@ -157,5 +157,5 @@ let _ = CicNotation.load_notation (Helm_registry.get "notation.core_file"); print_endline "done."; flush stdout; - process_stream (Stream.of_channel stdin) + process_stream (Ulexing.from_utf8_channel stdin) -- 2.39.2