1 (* Copyright (C) 2005, HELM Team.
3 * This file is part of HELM, an Hypertextual, Electronic
4 * Library of Mathematics, developed at the Computer Science
5 * Department, University of Bologna, Italy.
7 * HELM is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
12 * HELM is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with HELM; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
22 * For details, see the HELM World-Wide-Web page,
23 * http://helm.cs.unibo.it/
26 (* $Id: cicNotationLexer.ml 11028 2010-11-02 17:08:43Z tassi $ *)
30 exception Error of int * int * string
32 module StringSet = Set.Make(String)
37 let regexp number = xml_digit+
39 let regexp ident_letter = [ 'a' - 'z' 'A' - 'Z' ]
41 (* must be in sync with "is_ligature_char" below *)
42 let regexp ligature_char = [ "'`~!?@*()[]<>-+=|:;.,/\"" ]
43 let regexp ligature = ligature_char ligature_char+
45 let regexp ident_decoration = '\'' | '?' | '`'
46 let regexp ident_cont = ident_letter | xml_digit | '_'
47 let regexp ident_start = ident_letter
48 let regexp ident = ident_letter ident_cont* ident_decoration*
49 let regexp variable_ident = '_' '_' number
50 let regexp pident = '_' ident
52 let regexp tex_token = '\\' ident
54 let regexp csymbol = '\'' ident
56 let regexp begincom = "(*"
57 let regexp endcom = "*)"
58 let regexp qstring = '"' [^ '"']* '"'
63 (* (string, int) Hashtbl.t, with multiple bindings.
64 * int is the unicode codepoint *)
65 let ligatures = Hashtbl.create 23
69 (fun (ligature, symbol) -> Hashtbl.add ligatures ligature symbol)
70 [ ("->", <:unicode<to>>); ("=>", <:unicode<Rightarrow>>);
71 (":=", <:unicode<def>>);
74 let error lexbuf msg =
75 let begin_cnum, end_cnum = Ulexing.loc lexbuf in
76 raise (Error (begin_cnum, end_cnum, msg))
77 let error_at_end lexbuf msg =
78 let begin_cnum, end_cnum = Ulexing.loc lexbuf in
79 raise (Error (begin_cnum, end_cnum, msg))
83 let return_symbol lexbuf s = return lexbuf ("SYMBOL", s)
84 let return_eoi lexbuf = return lexbuf ("EOI", "")
87 let remove_quotes s = String.sub s 1 (String.length s - 2)
91 (* let lexbuf = Ulexing.from_utf8_stream stream in *)
92 (** XXX Obj.magic rationale.
94 * camlp5 constraints the tok_func field of Token.glexer to have type:
95 * Stream.t char -> (Stream.t 'te * flocation_function)
96 * In order to use ulex we have (in theory) to instantiate a new lexbuf each
97 * time a char Stream.t is passed, destroying the previous lexbuf which may
98 * have consumed a character from the old stream which is lost forever :-(
100 * Instead of passing to camlp5 a char Stream.t we pass a lexbuf, casting it to
101 * char Stream.t with Obj.magic where needed.
103 let lexbuf = Obj.magic stream in
104 Token.make_stream_and_location
109 | Ulexing.Error -> error_at_end lexbuf "Unexpected character"
110 | Ulexing.InvalidCodepoint p ->
111 error_at_end lexbuf (sprintf "Invalid code point: %d" p))
114 Token.tok_func = tok_func;
115 Token.tok_using = (fun _ -> ());
116 Token.tok_removing = (fun _ -> ());
117 Token.tok_match = (* Token.default_match *) (fun (a,b) () -> a);
118 Token.tok_text = Token.lexer_text;
119 Token.tok_comm = None;
122 let expand_macro lexbuf =
124 Ulexing.utf8_sub_lexeme lexbuf 1 (Ulexing.lexeme_length lexbuf - 1)
127 ("SYMBOL" ^ (Utf8Macro.expand macro))
128 with Utf8Macro.Macro_not_found _ ->
129 (* FG: unexpanded TeX macros are terminated by a space for rendering *)
130 "SYMBOL" ^ (Ulexing.utf8_lexeme lexbuf ^ " ")
132 let remove_quotes s = String.sub s 1 (String.length s - 2)
133 let remove_left_quote s = String.sub s 1 (String.length s - 1)
135 let print = Printf.fprintf
137 let handle_full_id ch id uri =
138 print ch "<A href=\"%s\">%s</A>" uri id
140 let handle_full_sym ch s uri desc =
141 let uri = try HExtlib.unopt uri
142 with Failure _ -> "cic:/fakeuri.def(1)"
143 in print ch "<A title=\"%s\" href=\"%s\">%s</A>" desc uri s
145 let handle_full_num ch num uri desc =
146 let uri = try HExtlib.unopt uri
147 with Failure _ -> "cic:/fakeuri.def(1)"
148 in print ch "<A title=\"%s\" href=\"%s\">%s</A>" desc uri num
150 let handle_tex ch s =
151 (* print "TEX:%s" (remove_left_quote (Ulexing.utf8_lexeme lexbuf)) *)
154 let return_with_loc token begin_cnum end_cnum =
155 let flocation = HExtlib.floc_of_loc (begin_cnum,end_cnum) in
158 let return lexbuf token =
159 let begin_cnum, end_cnum = Ulexing.loc lexbuf in
160 return_with_loc token begin_cnum end_cnum
162 (** @param k continuation to be invoked when no ligature has been found *)
163 let ligatures_token k =
166 let lexeme = Ulexing.utf8_lexeme lexbuf in
167 (match List.rev (Hashtbl.find_all ligatures lexeme) with
168 | [] -> (* ligature not found, rollback and try default lexer *)
169 Ulexing.rollback lexbuf;
171 | default_lig :: _ -> (* ligatures found, use the default one *)
172 return_symbol lexbuf default_lig) *)
173 | eof -> return lexbuf ()
174 | _ -> (* not a ligature, rollback and try default lexer *)
175 Ulexing.rollback lexbuf;
178 let handle_interpr loc literal interp outch =
180 (match DisambiguateTypes.InterprEnv.find loc interp with
181 | GrafiteAst.Ident_alias (_,uri) ->
182 handle_full_id outch literal uri
183 | GrafiteAst.Symbol_alias (_,uri,desc) ->
184 handle_full_sym outch literal uri desc
185 | GrafiteAst.Number_alias (uri,desc) ->
186 handle_full_num outch literal uri desc)
187 with Not_found -> print outch "%s" literal
190 let level2_ast_token interp outch =
191 let rec aux nesting =
193 | begincom -> (print outch "%s" (Ulexing.utf8_lexeme lexbuf);
194 aux (nesting+1) lexbuf)
195 | endcom -> (print outch "%s" (Ulexing.utf8_lexeme lexbuf);
196 aux (max 0 (nesting-1)) lexbuf)
197 | eof -> return lexbuf ()
198 | _ -> if nesting > 0
199 then (print outch "%s" (Ulexing.utf8_lexeme lexbuf);
201 else (Ulexing.rollback lexbuf;
205 | ident -> let loc = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
206 let id = Ulexing.utf8_lexeme lexbuf in
207 handle_interpr loc id interp outch;
210 let loc = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
211 let id = Ulexing.utf8_lexeme lexbuf in
212 handle_interpr loc id interp outch;
214 | pident -> let loc = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
215 let id = Ulexing.utf8_lexeme lexbuf in
216 handle_interpr loc id interp outch;
219 let loc = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
220 let num = Ulexing.utf8_lexeme lexbuf in
221 handle_interpr loc num interp outch;
224 let loc = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
225 let s = Ulexing.utf8_lexeme lexbuf in
226 handle_interpr loc s interp outch;
228 | qstring -> (print outch "%s" (Ulexing.utf8_lexeme lexbuf); aux 0 lexbuf)
230 let loc = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
231 let s = Ulexing.utf8_lexeme lexbuf in
232 handle_interpr loc s interp outch;
234 | eof -> return lexbuf ()
236 let loc = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
237 let s = Ulexing.utf8_lexeme lexbuf in
238 handle_interpr loc s interp outch;
243 (* API implementation *)
245 (*let mk_small_lexer interpr keywords =
246 let initial_keywords =
247 [ "CProp"; "Prop"; "Type"; "Set"; "let"; "match";
248 "with"; "in"; "and"; "to"; "as"; "on"; "return"; "done";
252 List.fold_right StringSet.add (initial_keywords @ keywords) StringSet.empty
254 mk_lexer (level2_ast_token outch interpr)
257 let mk_small_printer interpr outch =
258 let initial_keywords =
259 [ "CProp"; "Prop"; "Type"; "Set"; "let"; "match";
260 "with"; "in"; "and"; "to"; "as"; "on"; "return"; "done";
264 List.fold_right StringSet.add initial_keywords StringSet.empty
266 level2_ast_token interpr outch
269 (* let mk_small_lexer interpr = mk_small_lexer interpr [];;*)