1 #!/usr/bin/ocamlrun /usr/bin/ocaml
5 with Invalid_argument _ ->
6 prerr_endline "Usage: split.ml <FILE.html>";
11 #require "pxp-engine";;
12 #require "pxp-lex-utf8";;
15 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>
16 <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">
17 <html xmlns=\"http://www.w3.org/1999/xhtml\">
27 ('a Pxp_document.node #Pxp_document.extension as 'a) Pxp_document.node
30 Pxp_reader.lookup_public_id_as_file [
31 "-//W3C//DTD XHTML 1.0 Transitional//EN", "xhtml1-transitional.dtd";
32 "-//W3C//ENTITIES Latin 1 for XHTML//EN", "xhtml-lat1.ent";
33 "-//W3C//ENTITIES Symbols for XHTML//EN", "xhtml-symbol.ent";
34 "-//W3C//ENTITIES Special for XHTML//EN", "xhtml-special.ent"; ]
37 Pxp_tree_parser.parse_wfdocument_entity
38 { Pxp_types.default_config with Pxp_types.encoding = `Enc_utf8 }
39 (Pxp_types.from_file ~alt:[ resolver ] fname)
40 Pxp_tree_parser.default_spec
42 (** pattern matching like predicate on pxp nodes *)
43 let match_elt tag attr_name ?attr_value () node =
44 node#node_type = Pxp_document.T_element tag
45 && (match attr_value with
47 (try node#attribute attr_name = Pxp_types.Value attr_value
48 with Not_found -> false)
49 | None -> List.mem attr_name node#attribute_names)
54 Pxp_document.find ~deeply:true
55 (match_elt "div" "class" ~attr_value:"article" ()) doc#root
56 with Not_found -> failwith "Can't find article <div>" in
58 Pxp_document.find ~deeply:false
59 (match_elt "div" "class" ~attr_value:"titlepage" ()) article in
61 Pxp_document.find ~deeply:false
62 (match_elt "div" "class" ~attr_value:"toc" ()) article in
64 Pxp_document.find_all ~deeply:false
65 (match_elt "div" "class" ~attr_value:"sect1" ()) article in
66 titlepage :: toc :: secs
68 let localize_ids secs =
69 let id2sec = Hashtbl.create 1023 in
70 let sec_ids = ref [] in
73 match Pxp_document.find_all ~deeply:true (match_elt "a" "id" ()) sec with
75 let sec_id = sec_id#required_string_attribute "id" in
76 sec_ids := sec_id :: !sec_ids;
79 let id = id#required_string_attribute "id" in
80 Hashtbl.add id2sec id sec_id)
82 | _ -> failwith "can't find section id")
86 let fname_of_sec sec_name = sec_name ^ ".html"
88 let get_sec_name sec =
91 Pxp_document.find ~deeply:true (match_elt "a" "id" ()) sec
92 with Not_found -> failwith "can't find section id" in
93 id#required_string_attribute "id"
95 let iter_xrefs f node =
96 let a_s = Pxp_document.find_all ~deeply:true (match_elt "a" "href" ()) node in
98 (fun (node: 'a node) ->
99 let href = node#required_string_attribute "href" in
100 assert (String.length href > 0);
101 if href.[0] = '#' then
102 let xref = String.sub href 1 (String.length href - 1) in
106 let patch_toc sec_ids id2sec toc =
109 if List.mem xref sec_ids then
110 node#set_attribute "href" (Pxp_types.Value (fname_of_sec xref))
113 let sec = Hashtbl.find id2sec xref in
114 node#set_attribute "href"
115 (Pxp_types.Value (fname_of_sec sec ^ "#" ^ xref))
116 with Not_found -> ())
119 (* let a_s = Pxp_document.find_all ~deeply:true (match_elt "a" "href" ()) toc in
121 (fun (node: 'a node) ->
122 let href = node#required_string_attribute "href" in
123 assert (String.length href > 0);
124 if href.[0] = '#' then
125 let xref = String.sub href 1 (String.length href - 1) in
126 if List.mem xref sec_ids then
127 node#set_attribute "href" (Pxp_types.Value (fname_of_sec xref))
130 let sec = Hashtbl.find id2sec xref in
131 node#set_attribute "href"
132 (Pxp_types.Value (fname_of_sec sec ^ "#" ^ xref))
133 with Not_found -> ())
136 let patch_sec sec_ids id2sec sec =
137 let sec_name = get_sec_name sec in
141 let xref_sec = Hashtbl.find id2sec xref in
142 if xref_sec <> sec_name then
143 node#set_attribute "href"
144 (Pxp_types.Value (fname_of_sec xref_sec ^ "#" ^ xref))
145 with Not_found -> ())
148 let open_formatter fname =
149 Unix.open_process_out (Printf.sprintf "xmllint --format -o %s -" fname)
151 let close_formatter oc = ignore (Unix.close_process_out oc)
153 let output_index (titlepage: 'a node) (toc: 'a node) fname =
154 let oc = open_formatter fname in
155 output_string oc xhtml_header;
156 titlepage#write (`Out_channel oc) `Enc_utf8;
157 toc#write (`Out_channel oc) `Enc_utf8;
158 output_string oc xhtml_trailer;
161 let output_sec (sec: 'a node) fname =
162 let oc = open_formatter fname in
163 output_string oc xhtml_header;
164 sec#write (`Out_channel oc) `Enc_utf8;
165 output_string oc xhtml_trailer;
169 let doc = parse_xml fname in
171 | titlepage :: toc :: secs ->
172 let sec_ids, id2sec = localize_ids secs in
173 patch_toc sec_ids id2sec toc;
174 List.iter (patch_sec sec_ids id2sec) secs;
175 output_index titlepage toc "index.html";
176 List.iter (fun sec -> output_sec sec (get_sec_name sec ^ ".html")) secs
177 | _ -> failwith "Unrecognized document structure"