("(" number (',' number)* ")")? (* reference spec *)
let regexp qstring = '"' [^ '"']* '"'
-let regexp hreftag = "<" [ 'A' 'a' ]
+let regexp xmarkup = '\005'
+let regexp ymarkup = '\006'
+
+let regexp hreftag = xmarkup [ 'A' 'a' ]
let regexp href = "href=\"" uri "\""
let regexp hreftitle = "title=" qstring
-let regexp hrefclose = "</" [ 'A' 'a' ] ">"
+let regexp hrefclose = xmarkup "/" [ 'A' 'a' ] ymarkup
+
+let regexp tag_cont = ident_letter | xml_digit | "_" | "-"
+let regexp gtagname = [ 'B' - 'Z' 'b' - 'z' ] | ident_letter tag_cont+
+let regexp attrname = ident_letter tag_cont*
+let regexp gopentag = xmarkup gtagname
+let regexp closetag = xmarkup "/" gtagname ymarkup
+let regexp attribute = attrname "=" qstring
+let regexp generictag =
+ gopentag (utf8_blank+ attribute)* ymarkup
let regexp tex_token = '\\' ident
| utf8_blank+ -> level2_meta_token lexbuf
| hreftag -> return lexbuf ("ATAG","")
| hrefclose -> return lexbuf ("ATAGEND","")
+ | generictag
+ | closetag -> level2_meta_token lexbuf
| ident ->
handle_keywords lexbuf (fun x -> List.mem x level2_meta_keywords) "IDENT"
| variable_ident -> return lexbuf ("IDENT", Ulexing.utf8_lexeme lexbuf)
if desc <> None || href <> None
then
(let s,e = HExtlib.loc_of_floc loc in
+
prerr_endline (Printf.sprintf "*** [%d,%d] \"%s\",\"%s\""
s e (so_pp href) (so_pp desc));
+
+ Printf.printf "*** [%d,%d] \"%s\",\"%s\""
+ s e (so_pp href) (so_pp desc);
LocalizeEnv.add loc (href,desc) loctable)
else loctable
;;
let level2_ast_token loctable status =
- let rec aux desc href =
+ let rec aux desc href in_tag =
lexer
| let_rec -> return lexbuf ("LETREC","")
| let_corec -> return lexbuf ("LETCOREC","")
| we_proved -> return lexbuf ("WEPROVED","")
| we_have -> return lexbuf ("WEHAVE","")
- | utf8_blank+ -> ligatures_token (aux desc href) lexbuf
+ | utf8_blank+ -> ligatures_token (aux desc href in_tag) lexbuf
| meta ->
let s = Ulexing.utf8_lexeme lexbuf in
return lexbuf ("META", String.sub s 1 (String.length s - 1))
| implicit -> return lexbuf ("IMPLICIT", "")
| placeholder -> return lexbuf ("PLACEHOLDER", "")
- | hreftag -> aux_in_tag None None lexbuf
- | hrefclose -> aux None None lexbuf
- | ident -> loctable :=
- update_table (loc_of_buf lexbuf) desc href !loctable;
- handle_keywords lexbuf (fun x -> StringSet.mem x status) "IDENT"
+ | hreftag -> aux_attr None None lexbuf
+ | hrefclose -> return lexbuf ("LEXING_ERROR", Ulexing.utf8_lexeme lexbuf)
+ (* ignore other tags *)
+ | generictag
+ | closetag -> ligatures_token (aux desc href in_tag) lexbuf
+ | ident -> if in_tag then
+ aux_close_tag desc href ("IDENT", Ulexing.utf8_lexeme lexbuf) lexbuf
+ else handle_keywords lexbuf (fun x -> StringSet.mem x status) "IDENT"
| variable_ident ->
return lexbuf ("IDENT", Ulexing.utf8_lexeme lexbuf)
- | pident -> loctable :=
- update_table (loc_of_buf lexbuf) desc href !loctable;
- handle_keywords lexbuf (fun x -> StringSet.mem x status) "PIDENT"
- | number -> loctable :=
- update_table (loc_of_buf lexbuf) desc href !loctable;
- return lexbuf ("NUMBER", Ulexing.utf8_lexeme lexbuf)
- | tex_token -> loctable :=
- update_table (loc_of_buf lexbuf) desc href !loctable;
- return lexbuf (expand_macro lexbuf)
+ | pident -> if in_tag then
+ aux_close_tag desc href ("PIDENT", Ulexing.utf8_lexeme lexbuf) lexbuf
+ else handle_keywords lexbuf (fun x -> StringSet.mem x status) "PIDENT"
+ | number -> let token = "NUMBER", Ulexing.utf8_lexeme lexbuf
+ in
+ if in_tag then
+ aux_close_tag desc href token lexbuf
+ else return lexbuf token
+ | tex_token -> let token = expand_macro lexbuf
+ in
+ if in_tag then
+ aux_close_tag desc href token lexbuf
+ else return lexbuf token
| nreference -> return lexbuf ("NREF", Ulexing.utf8_lexeme lexbuf)
| uri -> return lexbuf ("URI", Ulexing.utf8_lexeme lexbuf)
| qstring ->
Ulexing.utf8_sub_lexeme lexbuf 2 (Ulexing.lexeme_length lexbuf - 4)
in
return lexbuf ("NOTE", comment) *)
- ligatures_token (aux desc href) lexbuf
+ ligatures_token (aux desc href in_tag) lexbuf
| begincomment -> return lexbuf ("BEGINCOMMENT","")
| endcomment -> return lexbuf ("ENDCOMMENT","")
| eof -> return_eoi lexbuf
- | _ -> loctable :=
- update_table (loc_of_buf lexbuf) desc href !loctable;
- return_symbol lexbuf (Ulexing.utf8_lexeme lexbuf)
-
- and aux_in_tag desc href = lexer
- | utf8_blank+ -> ligatures_token (aux_in_tag desc href) lexbuf
+ | _ -> let token = "SYMBOL", (Ulexing.utf8_lexeme lexbuf)
+ in
+ if in_tag then
+ aux_close_tag desc href token lexbuf
+ else return lexbuf token
+
+ and aux_attr desc href = lexer
+ | utf8_blank+ -> ligatures_token (aux_attr desc href) lexbuf
| href ->
- aux_in_tag desc
+ aux_attr desc
(Some (Ulexing.utf8_sub_lexeme lexbuf 6 (Ulexing.lexeme_length lexbuf - 7)))
lexbuf
| hreftitle ->
- aux_in_tag
+ aux_attr
(Some (Ulexing.utf8_sub_lexeme lexbuf 7 (Ulexing.lexeme_length lexbuf - 8)))
href lexbuf
- | ">" -> aux desc href lexbuf
- | _ -> aux None None lexbuf
- in aux None None
+ | ymarkup -> aux desc href true lexbuf
+ | _ -> return lexbuf ("LEXING_ERROR", Ulexing.utf8_lexeme lexbuf)
+
+(* and aux_in_tag desc href = lexer
+ | ident -> loctable :=
+ update_table (loc_of_buf lexbuf) desc href !loctable;
+ handle_keywords lexbuf (fun x -> StringSet.mem x status) "IDENT"
+ | variable_ident ->
+ return lexbuf ("IDENT", Ulexing.utf8_lexeme lexbuf)
+ | pident -> loctable :=
+ update_table (loc_of_buf lexbuf) desc href !loctable;
+ handle_keywords lexbuf (fun x -> StringSet.mem x status) "PIDENT"
+ | number -> loctable :=
+ update_table (loc_of_buf lexbuf) desc href !loctable;
+ return lexbuf ("NUMBER", Ulexing.utf8_lexeme lexbuf)
+ | tex_token -> loctable :=
+ update_table (loc_of_buf lexbuf) desc href !loctable;
+ return lexbuf (expand_macro lexbuf)
+ | _ -> loctable :=
+ update_table (loc_of_buf lexbuf) desc href !loctable;
+ return_symbol lexbuf (Ulexing.utf8_lexeme lexbuf)
+ *)
+ and aux_close_tag desc href token = lexer
+ | hrefclose -> loctable := update_table (loc_of_buf lexbuf) desc href !loctable;
+ return lexbuf token
+ | _ -> return lexbuf ("LEXING_ERROR", Ulexing.utf8_lexeme lexbuf)
+ in aux None None false
let rec level1_pattern_token =
lexer
| number -> return lexbuf ("NUMBER", Ulexing.utf8_lexeme lexbuf)
| hreftag -> return lexbuf ("ATAG", "")
| hrefclose -> return lexbuf ("ATAGEND","")
- | ident ->handle_keywords lexbuf (fun x -> List.mem x level1_keywords) "IDENT"
+ | generictag
+ | closetag -> ligatures_token level1_pattern_token lexbuf
+ | ident -> handle_keywords lexbuf (fun x -> List.mem x level1_keywords) "IDENT"
| variable_ident -> return lexbuf ("IDENT", Ulexing.utf8_lexeme lexbuf)
| pident->handle_keywords lexbuf (fun x->List.mem x level1_keywords) "PIDENT"
| color -> return lexbuf ("COLOR", Ulexing.utf8_lexeme lexbuf)