X-Git-Url: http://matita.cs.unibo.it/gitweb/?a=blobdiff_plain;f=helm%2FDEVEL%2Fpxp%2Fpxp%2Ftools%2Fucs2_to_utf8%2Fucs2_to_utf8.ml;fp=helm%2FDEVEL%2Fpxp%2Fpxp%2Ftools%2Fucs2_to_utf8%2Fucs2_to_utf8.ml;h=1512d2300860148ac92a7cc1d0ecd40010ef0dd5;hb=c03d2c1fdab8d228cb88aaba5ca0f556318bebc5;hp=0000000000000000000000000000000000000000;hpb=758057e85325f94cd88583feb1fdf6b038e35055;p=helm.git diff --git a/helm/DEVEL/pxp/pxp/tools/ucs2_to_utf8/ucs2_to_utf8.ml b/helm/DEVEL/pxp/pxp/tools/ucs2_to_utf8/ucs2_to_utf8.ml new file mode 100644 index 000000000..1512d2300 --- /dev/null +++ b/helm/DEVEL/pxp/pxp/tools/ucs2_to_utf8/ucs2_to_utf8.ml @@ -0,0 +1,215 @@ +(******************************************************) +(* Claudio Sacerdoti Coen *) +(* 14/05/2000 *) +(******************************************************) + +(* Surrogate Pairs are not accepted in XML files (is it true???) *) +exception SurrogatePairs;; + +(* Interval (n,m) where n >m m *) +exception InvalidInterval of int * int;; + +(* Given an ucs2 character code, returns it in utf8 *) +(* (as a concatenation of characters) *) +let char_ucs2_to_utf8 = + function + n when n >= 0xD800 && n <= 0xDFFF -> raise SurrogatePairs + | n when n <= 0x007F -> Types.Char n + | n when n <= 0x07FF -> + Types.Concat + [[Types.Char (n lsr 6 land 0b00011111 lor 0b11000000)] ; + [Types.Char (n land 0b00111111 lor 0b10000000)]] + | n -> + Types.Concat + [[Types.Char (n lsr 12 land 0b00001111 lor 0b11100000)] ; + [Types.Char (n lsr 6 land 0b00111111 lor 0b10000000)] ; + [Types.Char (n land 0b00111111 lor 0b10000000)]] +;; + +(*CSC: Two functions for debugging pourposes only + +let char_ucs2_to_utf8 = + function + n when n >= 0xD800 && n <= 0xDFFF -> assert false + | n when n <= 0x007F -> [[n]] + | n when n <= 0x07FF -> + [[(n lsr 6 land 0b00011111 lor 0b11000000)] ; + [(n land 0b00111111 lor 0b10000000)]] + | n -> + [[(n lsr 12 land 0b00001111 lor 0b11100000)] ; + [(n lsr 6 land 0b00111111 lor 0b10000000)] ; + [(n land 0b00111111 lor 0b10000000)]] +;; + +let rec bprint = + function + 0 -> "" + | n -> bprint (n / 2) ^ string_of_int (n mod 2) +;; +*) + +(* A few useful functions *) +let rec mklist e = + function + 0 -> [] + | n -> e::(mklist e (n - 1)) +;; + +let sup = + let t = Types.Char 0b10111111 in + function + 1 -> t + | n -> Types.Concat (mklist [t] n) +;; + +let rec inf = + let b = Types.Char 0b10000000 in + function + 1 -> [[b]] + | n -> mklist [b] n +;; + +let mysucc = + function + [Types.Char n] -> n + 1 + | _ -> assert false +;; + +let mypred = + function + [Types.Char n] -> n - 1 + | _ -> assert false +;; + +(* Given two utf8-encoded extremes of an interval character code *) +(* whose 'length' is the same, it returns the utf8 regular expression *) +(* matching all the characters in the interval *) +let rec same_length_ucs2_to_utf8 = + let module T = Types in + function + (T.Char n, T.Char m) when n = m -> [T.Char n] + | (T.Char n, T.Char m) -> [T.Interval (n,m)] + | (T.Concat [hen ; [tln]], T.Concat [hem ; [tlm]]) when hen = hem -> + [T.Concat [hen ; same_length_ucs2_to_utf8 (tln,tlm)]] + | (T.Concat [hen ; [tln]], T.Concat ([hem ; [tlm]] as e2)) -> + (T.Concat [hen ; same_length_ucs2_to_utf8 (tln,sup 1)]) :: + (let shen = mysucc hen + and phem = mypred hem in + let succhen = [T.Char shen] in + if succhen = hem then + same_length_ucs2_to_utf8 (T.Concat (succhen::(inf 1)), T.Concat e2) + else + (T.Concat [[T.Interval (shen, phem)] ; + [T.Interval (0b10000000,0b10111111)]]):: + same_length_ucs2_to_utf8 (T.Concat (hem::(inf 1)), T.Concat e2) + ) + (*same_length_ucs2_to_utf8 (T.Concat ((mysucc hen)::(inf 1)), T.Concat e2)*) + | (T.Concat (hen::tln), T.Concat (hem::tlm)) when hen = hem -> + [T.Concat [hen ; same_length_ucs2_to_utf8 (T.Concat tln, T.Concat tlm)]] + | (T.Concat (hen::tln), T.Concat ((hem::tlm) as e2)) -> + let n = List.length tln in + (T.Concat + [hen ; same_length_ucs2_to_utf8 (T.Concat tln,sup n)]) :: + (let shen = mysucc hen + and phem = mypred hem in + let succhen = [T.Char shen] in + if succhen = hem then + same_length_ucs2_to_utf8 (T.Concat (succhen::(inf n)), T.Concat e2) + else + (T.Concat [[T.Interval (shen, phem)] ; + [T.Interval (0b10000000,0b10111111)] ; + [T.Interval (0b10000000,0b10111111)]] + ):: + same_length_ucs2_to_utf8 (T.Concat (hem::(inf n)), T.Concat e2) + ) + (*same_length_ucs2_to_utf8 (T.Concat ((mysucc hen)::(inf n)),T.Concat e2)*) + | _ -> assert false +;; + +(* Given an interval of ucs2 characters, splits *) +(* the list in subintervals whose extremes has *) +(* the same utf8 encoding length and, for each *) +(* extreme, calls same_length_ucs2_to_utf8 *) +let rec seq_ucs2_to_utf8 = + function + (n,_) when n >= 0xD800 && n <= 0xDFFF -> raise SurrogatePairs + | (_,n) when n >= 0xD800 && n <= 0xDFFF -> raise SurrogatePairs + | (n,m) when n > m -> raise (InvalidInterval (n,m)) + | (n,m) when n = m -> [char_ucs2_to_utf8 n] + | (n,m) when n <= 0x07F && m > 0x07F -> + (seq_ucs2_to_utf8 (n,0x07F)) @ (seq_ucs2_to_utf8 (0x080,m)) + | (n,m) when n <= 0x07FF && m > 0x07FF -> + (seq_ucs2_to_utf8 (n,0x07FF)) @ (seq_ucs2_to_utf8 (0x0800,m)) + | (n,m) -> + let utf8n = char_ucs2_to_utf8 n + and utf8m = char_ucs2_to_utf8 m in + same_length_ucs2_to_utf8 (utf8n,utf8m) +;; + +(* Given an ucs2 regual expression, returns *) +(* the corresponding utf8 regular expression *) +let ucs2_to_utf8 { Types.id = id ; Types.rel = rel } = + let rec aux re l2 = + match re with + Types.Char i -> char_ucs2_to_utf8 i :: l2 + | Types.Interval (l,u) -> seq_ucs2_to_utf8 (l,u) @ l2 + | Types.Identifier _ as i -> i :: l2 + | Types.Concat rell -> + let foo rel = List.fold_right aux rel [] in + Types.Concat (List.map foo rell) :: l2 + in + { Types.id = id ; Types.rel = List.fold_right aux rel [] } +;; + +(* The function actually used to produce the output *) +let output = print_string ;; + +(* padded_string_of_int i returns the string representing the *) +(* integer i (i < 256) using exactly 3 digits (example: 13 -> "013") *) +let padded_string_of_int i = + if i < 10 then + "00" ^ string_of_int i + else if i < 100 then + "0" ^ string_of_int i + else + string_of_int i +;; + +(* Two functions useful to print a definition *) +let rec print_disjunction ?(first = true) = + function + [] -> () + | he::tl -> + if not first then output " | " ; + print_re he ; + print_disjunction ~first:false tl +and print_re = + function + Types.Char i -> output ("'\\" ^ padded_string_of_int i ^ "'") + | Types.Interval (l,u) -> + output ("['\\" ^ padded_string_of_int l ^ "'-'\\" ^ + padded_string_of_int u ^ "']") + | Types.Identifier i -> output i + | Types.Concat rell -> + let foo rel = + if List.length rel > 1 then + (output "(" ; print_disjunction rel ; output ")") + else + print_disjunction rel + in + List.iter foo rell +;; + +(* print_definition prints a definition in the format expected by ocamllex *) +let print_definition { Types.id = id ; Types.rel = rel } = + output ("let " ^ id ^ " =\n ") ; + print_disjunction rel ; + output "\n\n" +;; + +(* main *) +let _ = + let lexbuf = Lexing.from_channel stdin in + let ucs2_result = Parser.main Lexer.token lexbuf in + List.iter print_definition (List.map ucs2_to_utf8 ucs2_result) +;;