]> matita.cs.unibo.it Git - helm.git/blobdiff - helm/DEVEL/pxp/netstring/nethtml.ml
Initial revision
[helm.git] / helm / DEVEL / pxp / netstring / nethtml.ml
diff --git a/helm/DEVEL/pxp/netstring/nethtml.ml b/helm/DEVEL/pxp/netstring/nethtml.ml
new file mode 100644 (file)
index 0000000..7f9d983
--- /dev/null
@@ -0,0 +1,276 @@
+(* $Id$
+ * ----------------------------------------------------------------------
+ *
+ *)
+
+open Nethtml_scanner;;
+
+type document =
+    Element of (string  *  (string*string) list  *  document list)
+  | Data of string
+;;
+
+
+exception End_of_scan;;
+
+
+let no_end_tag =  (* empty HTML elements *)
+  ref
+    [ "isindex";
+      "base";
+      "meta";
+      "link";
+      "hr";
+      "input";
+      "img";
+      "param";
+      "basefont";
+      "br";
+      "area";
+    ]
+;;
+
+
+let special_tag =   (* other lexical rules *)
+  ref
+    [ "script";
+      "style";
+    ]
+;;
+
+
+let rec parse_comment buf =
+  let t = scan_comment buf in
+  match t with
+      Mcomment ->
+       parse_comment buf
+    | Eof ->
+       raise End_of_scan
+    | _ ->
+       ()
+;;
+
+
+let rec parse_doctype buf =
+  let t = scan_doctype buf in
+  match t with
+      Mdoctype ->
+       parse_doctype buf
+    | Eof ->
+       raise End_of_scan
+    | _ ->
+       ()
+;;
+
+
+let parse_document buf =
+  let current_name = ref "" in
+  let current_atts = ref [] in
+  let current_subs = ref [] in
+  let stack = Stack.create() in
+
+  let parse_atts() =
+    let rec next_no_space() =
+      match scan_element buf with
+         Space _ -> next_no_space()
+       | t -> t
+    in
+
+    let rec parse_atts_lookahead next =
+      match next with
+         Relement -> []
+       | Name n ->
+           begin match next_no_space() with
+               Is ->
+                 begin match next_no_space() with
+                     Name v ->
+                       (String.lowercase n, String.uppercase v) ::
+                       parse_atts_lookahead (next_no_space())
+                   | Literal v ->
+                       (String.lowercase n,v) ::
+                       parse_atts_lookahead (next_no_space())
+                   | Eof ->
+                       raise End_of_scan
+                   | Relement ->
+                       (* Illegal *)
+                       []
+                   | _ ->
+                       (* Illegal *)
+                       parse_atts_lookahead (next_no_space())
+                 end
+             | Eof ->
+                 raise End_of_scan
+             | Relement ->
+                 (* <tag name> <==> <tag name="name"> *)
+                 [ String.lowercase n, String.lowercase n ]
+             | next' ->
+                 (* assume <tag name ... > <==> <tag name="name" ...> *)
+                 ( String.lowercase n, String.lowercase n ) ::
+                 parse_atts_lookahead next'
+           end
+       | Eof ->
+           raise End_of_scan
+       | _ ->
+           (* Illegal *)
+           parse_atts_lookahead (next_no_space())
+    in
+    parse_atts_lookahead (next_no_space())
+  in
+
+  let rec parse_special name =
+    (* Parse until </name> *)
+    match scan_special buf with
+       Lelementend n ->
+         if n = name then
+           ""
+         else
+           "</" ^ n ^ parse_special name
+      | Eof ->
+         raise End_of_scan
+      | Cdata s ->
+         s ^ parse_special name
+      | _ ->
+         (* Illegal *)
+         parse_special name
+  in
+
+  let rec skip_element() =
+    (* Skip until ">" *)
+    match scan_element buf with
+       Relement ->
+         ()
+      | Eof ->
+         raise End_of_scan
+      | _ ->
+         skip_element()
+  in
+
+  let rec parse_next() =
+    let t = scan_document buf in
+    match t with
+       Lcomment ->
+         parse_comment buf;
+         parse_next()
+      | Ldoctype ->
+         parse_doctype buf;
+         parse_next()
+      | Lelement name ->
+         let name = String.lowercase name in
+         if List.mem name !no_end_tag then begin
+           let atts = parse_atts() in
+           current_subs := (Element(name, atts, [])) :: !current_subs;
+           parse_next()
+         end
+         else if List.mem name !special_tag then begin
+           let atts = parse_atts() in
+           let data = parse_special name in
+           (* Read until ">" *)
+           skip_element();
+           current_subs := (Element(name, atts, [Data data])) :: !current_subs;
+           parse_next()
+         end
+         else begin
+           let atts = parse_atts() in
+           Stack.push (!current_name, !current_atts, !current_subs) stack;
+           current_name := name;
+           current_atts := atts;
+           current_subs := [];
+           parse_next()
+         end
+      | Cdata data ->
+         current_subs := (Data data) :: !current_subs;
+         parse_next()
+      | Lelementend name ->
+         let name = String.lowercase name in
+         (* Read until ">" *)
+         skip_element();
+         (* Search the element to close on the stack: *)
+         let found = ref (name = !current_name) in
+         Stack.iter
+           (fun (old_name, _, _) ->
+              if name = old_name then found := true)
+           stack;
+         (* If not found, the end tag is wrong. Simply ignore it. *)
+         if not !found then
+           parse_next()
+         else begin
+           (* Put the current element on to the stack: *)
+           Stack.push (!current_name, !current_atts, !current_subs) stack;
+           (* If found: Remove the elements from the stack, and append
+            * them to the previous element as sub elements
+            *)
+           let rec remove() =
+             let old_name, old_atts, old_subs = Stack.pop stack in
+               (* or raise Stack.Empty *)
+             if old_name = name then
+               old_name, old_atts, old_subs
+             else
+               let older_name, older_atts, older_subs = remove() in
+               older_name,
+               older_atts,
+               (Element (old_name, old_atts, List.rev old_subs) :: older_subs)
+           in
+           let old_name, old_atts, old_subs = remove() in
+           (* Remove one more element: the element containing the element
+            * currently being closed.
+            *)
+           let new_name, new_atts, new_subs = Stack.pop stack in
+           current_name := new_name;
+           current_atts := new_atts;
+           current_subs := (Element (old_name, old_atts, List.rev old_subs))
+                            :: new_subs;
+           (* Go on *)
+           parse_next()
+         end
+      | Eof ->
+         raise End_of_scan
+      | _ ->
+         parse_next()
+  in
+  try
+    parse_next();
+    List.rev !current_subs
+  with
+      End_of_scan ->
+       (* Close all remaining elements: *)
+       Stack.push (!current_name, !current_atts, !current_subs) stack;
+       let rec remove() =
+         let old_name, old_atts, old_subs = Stack.pop stack in
+               (* or raise Stack.Empty *)
+         try
+           let older_name, older_atts, older_subs = remove() in
+           older_name,
+           older_atts,
+           (Element (old_name, old_atts, List.rev old_subs) :: older_subs)
+         with
+             Stack.Empty ->
+               old_name, old_atts, old_subs
+       in
+       let name, atts, subs = remove() in
+       List.rev subs
+;;
+
+
+let parse_string s =
+  let buf = Lexing.from_string s in
+  parse_document buf
+;;
+
+
+let parse_file fd =
+  let buf = Lexing.from_channel fd in
+  parse_document buf
+;;
+
+(* ======================================================================
+ * History:
+ * 
+ * $Log$
+ * Revision 1.1  2000/11/17 09:57:28  lpadovan
+ * Initial revision
+ *
+ * Revision 1.1  2000/03/03 01:07:25  gerd
+ *     Initial revision.
+ *
+ * 
+ *)