1 (******************************************************)
2 (* Claudio Sacerdoti Coen <sacerdot@cs.unibo.it> *)
4 (******************************************************)
6 (* Surrogate Pairs are not accepted in XML files (is it true???) *)
7 exception SurrogatePairs;;
9 (* Interval (n,m) where n >m m *)
10 exception InvalidInterval of int * int;;
12 (* Given an ucs2 character code, returns it in utf8 *)
13 (* (as a concatenation of characters) *)
14 let char_ucs2_to_utf8 =
16 n when n >= 0xD800 && n <= 0xDFFF -> raise SurrogatePairs
17 | n when n <= 0x007F -> Types.Char n
18 | n when n <= 0x07FF ->
20 [[Types.Char (n lsr 6 land 0b00011111 lor 0b11000000)] ;
21 [Types.Char (n land 0b00111111 lor 0b10000000)]]
24 [[Types.Char (n lsr 12 land 0b00001111 lor 0b11100000)] ;
25 [Types.Char (n lsr 6 land 0b00111111 lor 0b10000000)] ;
26 [Types.Char (n land 0b00111111 lor 0b10000000)]]
29 (*CSC: Two functions for debugging pourposes only
31 let char_ucs2_to_utf8 =
33 n when n >= 0xD800 && n <= 0xDFFF -> assert false
34 | n when n <= 0x007F -> [[n]]
35 | n when n <= 0x07FF ->
36 [[(n lsr 6 land 0b00011111 lor 0b11000000)] ;
37 [(n land 0b00111111 lor 0b10000000)]]
39 [[(n lsr 12 land 0b00001111 lor 0b11100000)] ;
40 [(n lsr 6 land 0b00111111 lor 0b10000000)] ;
41 [(n land 0b00111111 lor 0b10000000)]]
47 | n -> bprint (n / 2) ^ string_of_int (n mod 2)
51 (* A few useful functions *)
55 | n -> e::(mklist e (n - 1))
59 let t = Types.Char 0b10111111 in
62 | n -> Types.Concat (mklist [t] n)
66 let b = Types.Char 0b10000000 in
74 [Types.Char n] -> n + 1
80 [Types.Char n] -> n - 1
84 (* Given two utf8-encoded extremes of an interval character code *)
85 (* whose 'length' is the same, it returns the utf8 regular expression *)
86 (* matching all the characters in the interval *)
87 let rec same_length_ucs2_to_utf8 =
88 let module T = Types in
90 (T.Char n, T.Char m) when n = m -> [T.Char n]
91 | (T.Char n, T.Char m) -> [T.Interval (n,m)]
92 | (T.Concat [hen ; [tln]], T.Concat [hem ; [tlm]]) when hen = hem ->
93 [T.Concat [hen ; same_length_ucs2_to_utf8 (tln,tlm)]]
94 | (T.Concat [hen ; [tln]], T.Concat ([hem ; [tlm]] as e2)) ->
95 (T.Concat [hen ; same_length_ucs2_to_utf8 (tln,sup 1)]) ::
96 (let shen = mysucc hen
97 and phem = mypred hem in
98 let succhen = [T.Char shen] in
100 same_length_ucs2_to_utf8 (T.Concat (succhen::(inf 1)), T.Concat e2)
102 (T.Concat [[T.Interval (shen, phem)] ;
103 [T.Interval (0b10000000,0b10111111)]])::
104 same_length_ucs2_to_utf8 (T.Concat (hem::(inf 1)), T.Concat e2)
106 (*same_length_ucs2_to_utf8 (T.Concat ((mysucc hen)::(inf 1)), T.Concat e2)*)
107 | (T.Concat (hen::tln), T.Concat (hem::tlm)) when hen = hem ->
108 [T.Concat [hen ; same_length_ucs2_to_utf8 (T.Concat tln, T.Concat tlm)]]
109 | (T.Concat (hen::tln), T.Concat ((hem::tlm) as e2)) ->
110 let n = List.length tln in
112 [hen ; same_length_ucs2_to_utf8 (T.Concat tln,sup n)]) ::
113 (let shen = mysucc hen
114 and phem = mypred hem in
115 let succhen = [T.Char shen] in
116 if succhen = hem then
117 same_length_ucs2_to_utf8 (T.Concat (succhen::(inf n)), T.Concat e2)
119 (T.Concat [[T.Interval (shen, phem)] ;
120 [T.Interval (0b10000000,0b10111111)] ;
121 [T.Interval (0b10000000,0b10111111)]]
123 same_length_ucs2_to_utf8 (T.Concat (hem::(inf n)), T.Concat e2)
125 (*same_length_ucs2_to_utf8 (T.Concat ((mysucc hen)::(inf n)),T.Concat e2)*)
129 (* Given an interval of ucs2 characters, splits *)
130 (* the list in subintervals whose extremes has *)
131 (* the same utf8 encoding length and, for each *)
132 (* extreme, calls same_length_ucs2_to_utf8 *)
133 let rec seq_ucs2_to_utf8 =
135 (n,_) when n >= 0xD800 && n <= 0xDFFF -> raise SurrogatePairs
136 | (_,n) when n >= 0xD800 && n <= 0xDFFF -> raise SurrogatePairs
137 | (n,m) when n > m -> raise (InvalidInterval (n,m))
138 | (n,m) when n = m -> [char_ucs2_to_utf8 n]
139 | (n,m) when n <= 0x07F && m > 0x07F ->
140 (seq_ucs2_to_utf8 (n,0x07F)) @ (seq_ucs2_to_utf8 (0x080,m))
141 | (n,m) when n <= 0x07FF && m > 0x07FF ->
142 (seq_ucs2_to_utf8 (n,0x07FF)) @ (seq_ucs2_to_utf8 (0x0800,m))
144 let utf8n = char_ucs2_to_utf8 n
145 and utf8m = char_ucs2_to_utf8 m in
146 same_length_ucs2_to_utf8 (utf8n,utf8m)
149 (* Given an ucs2 regual expression, returns *)
150 (* the corresponding utf8 regular expression *)
151 let ucs2_to_utf8 { Types.id = id ; Types.rel = rel } =
154 Types.Char i -> char_ucs2_to_utf8 i :: l2
155 | Types.Interval (l,u) -> seq_ucs2_to_utf8 (l,u) @ l2
156 | Types.Identifier _ as i -> i :: l2
157 | Types.Concat rell ->
158 let foo rel = List.fold_right aux rel [] in
159 Types.Concat (List.map foo rell) :: l2
161 { Types.id = id ; Types.rel = List.fold_right aux rel [] }
164 (* The function actually used to produce the output *)
165 let output = print_string ;;
167 (* padded_string_of_int i returns the string representing the *)
168 (* integer i (i < 256) using exactly 3 digits (example: 13 -> "013") *)
169 let padded_string_of_int i =
171 "00" ^ string_of_int i
173 "0" ^ string_of_int i
178 (* Two functions useful to print a definition *)
179 let rec print_disjunction ?(first = true) =
183 if not first then output " | " ;
185 print_disjunction ~first:false tl
188 Types.Char i -> output ("'\\" ^ padded_string_of_int i ^ "'")
189 | Types.Interval (l,u) ->
190 output ("['\\" ^ padded_string_of_int l ^ "'-'\\" ^
191 padded_string_of_int u ^ "']")
192 | Types.Identifier i -> output i
193 | Types.Concat rell ->
195 if List.length rel > 1 then
196 (output "(" ; print_disjunction rel ; output ")")
198 print_disjunction rel
203 (* print_definition prints a definition in the format expected by ocamllex *)
204 let print_definition { Types.id = id ; Types.rel = rel } =
205 output ("let " ^ id ^ " =\n ") ;
206 print_disjunction rel ;
212 let lexbuf = Lexing.from_channel stdin in
213 let ucs2_result = Parser.main Lexer.token lexbuf in
214 List.iter print_definition (List.map ucs2_to_utf8 ucs2_result)