(* Copyright (C) 2004, HELM Team.
 * 
 * This file is part of HELM, an Hypertextual, Electronic
 * Library of Mathematics, developed at the Computer Science
 * Department, University of Bologna, Italy.
 * 
 * HELM is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * HELM is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HELM; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 * MA  02111-1307, USA.
 * 
 * For details, see the HELM World-Wide-Web page,
 * http://helm.cs.unibo.it/
 *)

exception Error of int * int * string

exception Not_an_extended_ident

let regexp alpha = [ 'a' - 'z' 'A' - 'Z' ]
let regexp digit = [ '0' - '9' ]
let regexp blank = [ ' ' '\t' '\n' ]
let regexp paren = [ '(' '[' '{' ')' ']' '}' ]
let regexp implicit = '?'
let regexp placeholder = '%'
let regexp symbol_char =
  [^ 'a' - 'z' 'A' - 'Z' '0' - '9'
     ' ' '\t' '\n'
     '\\' '(' '[' '{' ')' ']' '}' '?'
  ]

let regexp comment_char = [^'*'] | '*'[^')']
let regexp note = "(*" ([^'*'] | "**") comment_char* "*)"

let regexp commentbegin = "(**" blank
let regexp commentend = "*)"

let regexp blanks = blank+
let regexp num = digit+
let regexp tex_token = '\\' alpha+
let regexp symbol = symbol_char
let regexp ident_cont = alpha | num | '_' | '\''
let regexp ident_cont' = ident_cont | tex_token
let regexp ident = (alpha ident_cont*) | ('_' ident_cont+)
let regexp ident' = ((alpha | tex_token) ident_cont'*) | ('_' ident_cont'+)
let regexp uri_step = (alpha (ident_cont|'-')*)
let regexp meta = implicit num
let regexp qstring = '"' [^ '"']* '"'
let regexp uri_suffix = "con"|"ind"|"var"
let regexp uri =
  ("cic:/" | "theory:/")              (* schema *)
  uri_step ('/' uri_step)*            (* path *)
  ('.' uri_suffix)+                   (* ext *)
  ("#xpointer(" num ('/' num)+ ")")?  (* xpointer *)
(* let regexp catchall = .* *)

let keywords = Hashtbl.create 17
let _ =
  List.iter (fun keyword -> Hashtbl.add keywords keyword ("", keyword))
    [ "Prop"; "Type"; "Set"; "let"; "Let"; "rec"; "using"; "match"; "with";
    "in"; "and"; "to"; "as"; "on"; "names"]

let error lexbuf msg =
  raise (Error (Ulexing.lexeme_start lexbuf, Ulexing.lexeme_end lexbuf, msg))
let error_at_end lexbuf msg =
  raise (Error (Ulexing.lexeme_end lexbuf, Ulexing.lexeme_end lexbuf, msg))

let return lexbuf token =
  let flocation_begin =
    { Lexing.pos_fname = ""; Lexing.pos_lnum = -1; Lexing.pos_bol = -1;
      Lexing.pos_cnum = Ulexing.lexeme_start lexbuf }
  in
  let flocation_end =
    { flocation_begin with Lexing.pos_cnum = Ulexing.lexeme_end lexbuf }
  in
  (token, (flocation_begin, flocation_end))

(*
let parse_ext_ident ident =
  let len = String.length ident in
  let buf = Buffer.create len in
  let in_tex_token = ref false in
  let tex_token = Buffer.create 10 in
  try
    for i = 0 to len - 1 do
      match ident.[i] with
      | '\' when not !in_tex_token ->
          if i < len - 1 &&
          in_tex_token := true
    done
  with Invalid_argument -> assert false

let rec token' = lexer
  | ident' ->
      (try
        let ident = parse_ext_ident (Ulexing.utf8_lexeme lexbuf) in
        return lexbuf ("IDENT'", ident)
      with Not_an_extended_ident ->
        Ulexing.rollback lexbuf;
        token lexbuf)
  | _ ->
      Ulexing.rollback lexbuf;
      token lexbuf

and token = lexer
*)

let remove_quotes s = String.sub s 1 (String.length s - 2)

let rec token comments = lexer
  | blanks -> token comments lexbuf
  | uri -> return lexbuf ("URI", Ulexing.utf8_lexeme lexbuf)
  | ident ->
      let lexeme = Ulexing.utf8_lexeme lexbuf in
      (try
        return lexbuf (Hashtbl.find keywords lexeme)
      with Not_found -> return lexbuf ("IDENT", lexeme))
  | num -> return lexbuf ("NUM", Ulexing.utf8_lexeme lexbuf)
  | paren -> return lexbuf ("PAREN", Ulexing.utf8_lexeme lexbuf)
  | meta -> return lexbuf ("META", Ulexing.utf8_lexeme lexbuf)
  | implicit -> return lexbuf ("IMPLICIT", Ulexing.utf8_lexeme lexbuf)
  | placeholder -> return lexbuf ("PLACEHOLDER", Ulexing.utf8_lexeme lexbuf)
  | qstring ->
      return lexbuf ("QSTRING", remove_quotes (Ulexing.utf8_lexeme lexbuf))
  | symbol -> return lexbuf ("SYMBOL", Ulexing.utf8_lexeme lexbuf)
  | tex_token ->
      let macro =
        Ulexing.utf8_sub_lexeme lexbuf 1 (Ulexing.lexeme_length lexbuf - 1)
      in
      (try
        return lexbuf ("SYMBOL", Utf8Macro.expand macro)
      with Utf8Macro.Macro_not_found _ ->
        return lexbuf ("SYMBOL", Ulexing.utf8_lexeme lexbuf))
  | note -> 
      (*if comments then*)
        let comment =
          Ulexing.utf8_sub_lexeme lexbuf 2 (Ulexing.lexeme_length lexbuf - 4)
        in
        return lexbuf ("NOTE", comment)
      (*else
        token comments lexbuf*)
  | commentbegin -> return lexbuf ("BEGINCOMMENT","")
  | commentend -> return lexbuf ("ENDCOMMENT","")
  | eof -> return lexbuf ("EOI", "")
  | _ -> error lexbuf "Invalid character"

let tok_func comments stream =
  let lexbuf = Ulexing.from_utf8_stream stream in
  Token.make_stream_and_flocation
    (fun () ->
      try
	token comments lexbuf
      with
      | Ulexing.Error -> error_at_end lexbuf "Unexpected character"
      | Ulexing.InvalidCodepoint i -> error_at_end lexbuf "Invalid code point")

let cic_lexer ?(comments = false) () =
  {
    Token.tok_func = tok_func comments;
    Token.tok_using = (fun _ -> ());
    Token.tok_removing = (fun _ -> ()); 
    Token.tok_match = Token.default_match;
    Token.tok_text = Token.lexer_text;
    Token.tok_comm = None;
  }