X-Git-Url: http://matita.cs.unibo.it/gitweb/?a=blobdiff_plain;f=helm%2FDEVEL%2Fpxp%2Fnetstring%2Fnethtml.ml;fp=helm%2FDEVEL%2Fpxp%2Fnetstring%2Fnethtml.ml;h=7f9d983cddd85529fb0af589cdef78ae2a2d4848;hb=c03d2c1fdab8d228cb88aaba5ca0f556318bebc5;hp=0000000000000000000000000000000000000000;hpb=758057e85325f94cd88583feb1fdf6b038e35055;p=helm.git diff --git a/helm/DEVEL/pxp/netstring/nethtml.ml b/helm/DEVEL/pxp/netstring/nethtml.ml new file mode 100644 index 000000000..7f9d983cd --- /dev/null +++ b/helm/DEVEL/pxp/netstring/nethtml.ml @@ -0,0 +1,276 @@ +(* $Id$ + * ---------------------------------------------------------------------- + * + *) + +open Nethtml_scanner;; + +type document = + Element of (string * (string*string) list * document list) + | Data of string +;; + + +exception End_of_scan;; + + +let no_end_tag = (* empty HTML elements *) + ref + [ "isindex"; + "base"; + "meta"; + "link"; + "hr"; + "input"; + "img"; + "param"; + "basefont"; + "br"; + "area"; + ] +;; + + +let special_tag = (* other lexical rules *) + ref + [ "script"; + "style"; + ] +;; + + +let rec parse_comment buf = + let t = scan_comment buf in + match t with + Mcomment -> + parse_comment buf + | Eof -> + raise End_of_scan + | _ -> + () +;; + + +let rec parse_doctype buf = + let t = scan_doctype buf in + match t with + Mdoctype -> + parse_doctype buf + | Eof -> + raise End_of_scan + | _ -> + () +;; + + +let parse_document buf = + let current_name = ref "" in + let current_atts = ref [] in + let current_subs = ref [] in + let stack = Stack.create() in + + let parse_atts() = + let rec next_no_space() = + match scan_element buf with + Space _ -> next_no_space() + | t -> t + in + + let rec parse_atts_lookahead next = + match next with + Relement -> [] + | Name n -> + begin match next_no_space() with + Is -> + begin match next_no_space() with + Name v -> + (String.lowercase n, String.uppercase v) :: + parse_atts_lookahead (next_no_space()) + | Literal v -> + (String.lowercase n,v) :: + parse_atts_lookahead (next_no_space()) + | Eof -> + raise End_of_scan + | Relement -> + (* Illegal *) + [] + | _ -> + (* Illegal *) + parse_atts_lookahead (next_no_space()) + end + | Eof -> + raise End_of_scan + | Relement -> + (* <==> *) + [ String.lowercase n, String.lowercase n ] + | next' -> + (* assume <==> *) + ( String.lowercase n, String.lowercase n ) :: + parse_atts_lookahead next' + end + | Eof -> + raise End_of_scan + | _ -> + (* Illegal *) + parse_atts_lookahead (next_no_space()) + in + parse_atts_lookahead (next_no_space()) + in + + let rec parse_special name = + (* Parse until *) + match scan_special buf with + Lelementend n -> + if n = name then + "" + else + " + raise End_of_scan + | Cdata s -> + s ^ parse_special name + | _ -> + (* Illegal *) + parse_special name + in + + let rec skip_element() = + (* Skip until ">" *) + match scan_element buf with + Relement -> + () + | Eof -> + raise End_of_scan + | _ -> + skip_element() + in + + let rec parse_next() = + let t = scan_document buf in + match t with + Lcomment -> + parse_comment buf; + parse_next() + | Ldoctype -> + parse_doctype buf; + parse_next() + | Lelement name -> + let name = String.lowercase name in + if List.mem name !no_end_tag then begin + let atts = parse_atts() in + current_subs := (Element(name, atts, [])) :: !current_subs; + parse_next() + end + else if List.mem name !special_tag then begin + let atts = parse_atts() in + let data = parse_special name in + (* Read until ">" *) + skip_element(); + current_subs := (Element(name, atts, [Data data])) :: !current_subs; + parse_next() + end + else begin + let atts = parse_atts() in + Stack.push (!current_name, !current_atts, !current_subs) stack; + current_name := name; + current_atts := atts; + current_subs := []; + parse_next() + end + | Cdata data -> + current_subs := (Data data) :: !current_subs; + parse_next() + | Lelementend name -> + let name = String.lowercase name in + (* Read until ">" *) + skip_element(); + (* Search the element to close on the stack: *) + let found = ref (name = !current_name) in + Stack.iter + (fun (old_name, _, _) -> + if name = old_name then found := true) + stack; + (* If not found, the end tag is wrong. Simply ignore it. *) + if not !found then + parse_next() + else begin + (* Put the current element on to the stack: *) + Stack.push (!current_name, !current_atts, !current_subs) stack; + (* If found: Remove the elements from the stack, and append + * them to the previous element as sub elements + *) + let rec remove() = + let old_name, old_atts, old_subs = Stack.pop stack in + (* or raise Stack.Empty *) + if old_name = name then + old_name, old_atts, old_subs + else + let older_name, older_atts, older_subs = remove() in + older_name, + older_atts, + (Element (old_name, old_atts, List.rev old_subs) :: older_subs) + in + let old_name, old_atts, old_subs = remove() in + (* Remove one more element: the element containing the element + * currently being closed. + *) + let new_name, new_atts, new_subs = Stack.pop stack in + current_name := new_name; + current_atts := new_atts; + current_subs := (Element (old_name, old_atts, List.rev old_subs)) + :: new_subs; + (* Go on *) + parse_next() + end + | Eof -> + raise End_of_scan + | _ -> + parse_next() + in + try + parse_next(); + List.rev !current_subs + with + End_of_scan -> + (* Close all remaining elements: *) + Stack.push (!current_name, !current_atts, !current_subs) stack; + let rec remove() = + let old_name, old_atts, old_subs = Stack.pop stack in + (* or raise Stack.Empty *) + try + let older_name, older_atts, older_subs = remove() in + older_name, + older_atts, + (Element (old_name, old_atts, List.rev old_subs) :: older_subs) + with + Stack.Empty -> + old_name, old_atts, old_subs + in + let name, atts, subs = remove() in + List.rev subs +;; + + +let parse_string s = + let buf = Lexing.from_string s in + parse_document buf +;; + + +let parse_file fd = + let buf = Lexing.from_channel fd in + parse_document buf +;; + +(* ====================================================================== + * History: + * + * $Log$ + * Revision 1.1 2000/11/17 09:57:28 lpadovan + * Initial revision + * + * Revision 1.1 2000/03/03 01:07:25 gerd + * Initial revision. + * + * + *)