2 * ----------------------------------------------------------------------
3 * PXP: The polymorphic XML parser for Objective Caml.
4 * Copyright by Gerd Stolpmann. See LICENSE for details.
5 * Some auxiliary functions
8 (**********************************************************************)
(* [character enc warner k] validates code point [k] against the range of
 * characters accepted by XML (surrogates 0xD800-0xDFFF, 0xFFFE/0xFFFF,
 * points above 0x10FFFF, and most C0 controls are rejected with
 * [WF_error]) and — presumably — converts it to a string in the internal
 * representation encoding [enc] via [makechar].
 * NOTE(review): this listing is elided (the embedded source line numbers
 * jump, e.g. 20 -> 22), so the [then] branch delimiters and the handler
 * that invokes [warner#warn] are not fully visible — confirm against the
 * complete source.  The deprecated operators [&]/[or] should eventually
 * become [&&]/[||], but that fix needs the missing lines. *)
17 let character enc warner k =
19 if (k >= 0xd800 & k < 0xe000) or (k >= 0xfffe & k <= 0xffff) or k > 0x10ffff
20 or (k < 8) or (k = 11) or (k = 12) or (k >= 14 & k <= 31)
22 raise (WF_error("Code point " ^ string_of_int k ^
23 " outside the accepted range of code points"));
26 makechar (enc : rep_encoding :> encoding) k
(* presumably the fallback when [makechar] cannot represent the code
 * point in [enc] — TODO confirm the surrounding try/with *)
29 warner # warn ("Code point cannot be represented in internal encoding: "
(* [check_name warner name] warns (via [warner#warn]) when [name] begins
 * with any capitalization of "xml" — such names are reserved by the XML
 * specification for future standardization.  Names shorter than 3
 * characters are accepted silently.
 * NOTE(review): the listing is elided; the match's fall-through case and
 * the closing [end] are not visible here. *)
35 let check_name warner name =
36 (* produces a warning for names beginning with "xml" (in any mixture of
 * upper/lower case). *)
37 if String.length name >= 3 then begin
38 match String.sub name 0 3 with
39 ("xml" | "xmL" | "xMl" | "xML" | "Xml" | "XmL" | "XMl" | "XML") ->
40 warner # warn ("Name is reserved for future extensions: " ^ name)
(* [tokens_of_content_string lexerset s] runs the content-string scanner
 * of [lexerset] over [s] and collects the produced tokens into a list.
 * NOTE(review): the listing is elided — the terminating case of the
 * match (presumably end-of-input returning []) is missing (line 52),
 * as is the final invocation of [next_token]. *)
47 let tokens_of_content_string lexerset s =
48 (* tokenizes general entities and character entities *)
49 let lexbuf = Lexing.from_string s in
50 let rec next_token () =
51 match lexerset.scan_content_string lexbuf with
(* non-terminal tokens are consed onto the recursively scanned rest;
 * not tail-recursive, which is fine for short attribute/content strings *)
53 | tok -> tok :: next_token()
(* [expand_attvalue_with_rec_check lexerset dtd s warner entities norm_crlf]
 * recursively expands general-entity and character references inside the
 * attribute value [s].  [entities] is the list of entity names currently
 * being expanded; it is used to detect recursive references (WF_error).
 * Standalone-declaration violations and references to external entities
 * raise [Validation_error]; a literal '<' in the value is rejected.
 * NOTE(review): heavily elided listing — the [expand] dispatch head, the
 * CharData normalization arms, and the terminating cases are missing;
 * comments below are hedged accordingly. *)
59 let rec expand_attvalue_with_rec_check lexerset dtd s warner entities norm_crlf =
60 (* recursively expands general entities and character entities;
61 * checks "standalone" document declaration;
62 * normalizes whitespace
64 let toklist = tokens_of_content_string lexerset s in
(* recursion check: an entity must not (directly or indirectly) contain
 * a reference to itself *)
69 if List.mem n entities then
70 raise(WF_error("Recursive reference to general entity `" ^ n ^ "'"));
71 let en, extdecl = dtd # gen_entity n in
(* a standalone document must not reference externally declared entities *)
72 if dtd # standalone_declaration && extdecl then
73 raise(Validation_error("Reference to entity `" ^ n ^
74 "' violates standalone declaration"));
75 let rtext, rtext_contains_ext_refs = en # replacement_text in
76 if rtext_contains_ext_refs then
77 raise(Validation_error("Found reference to external entity in attribute value"));
(* expand the replacement text recursively, with [n] pushed onto the
 * in-progress list; norm_crlf=false because CRLF was already normalized
 * when the replacement text was produced — TODO confirm *)
78 expand_attvalue_with_rec_check
79 lexerset dtd rtext warner (n :: entities) false ^ expand tl'
86 character lexerset.lex_encoding warner n ^ expand tl'
(* '<' must not occur literally in attribute values (well-formedness) *)
87 | CharData "<" :: tl' ->
90 ("Attribute value contains character '<' literally"))
91 | CharData x :: tl' ->
(* [expand_attvalue lexerset dtd s warner norm_crlf] is the public entry
 * point for attribute-value expansion: it delegates to
 * [expand_attvalue_with_rec_check] with an empty list of in-progress
 * entity names (no expansion is underway yet). *)
99 let expand_attvalue lexerset dtd s warner norm_crlf =
100 (* norm_crlf: whether the sequence CRLF is recognized as one character or
101 * not (i.e. two characters)
103 expand_attvalue_with_rec_check lexerset dtd s warner [] norm_crlf
(* NOTE(review): the [let <name> s =] line introducing this definition is
 * not visible in this elided listing — presumably a helper counting
 * lines/columns for error positions; confirm its name and signature in
 * the full source.  It walks [s] looking for CR (\013) and LF (\010),
 * treating CRLF as a single line break, accumulating the line count [n]
 * from offset [k]. *)
108 (* returns number of lines in s, number of columns of the last line *)
109 let l = String.length s in
111 let rec count n k no_cr no_lf =
(* index of the next CR at or after [k], or -1 if none *)
116 try String.index_from s k '\013' with Not_found -> (-1) in
(* index of the next LF at or after [k], or -1 if none *)
121 try String.index_from s k '\010' with Not_found -> (-1) in
122 if next_cr >= 0 & (next_lf < 0 or next_cr < next_lf) then begin
(* CR immediately followed by LF counts as one line break *)
123 if next_cr+1 < l & s.[next_cr+1] = '\010' then
124 count (n+1) (next_cr+2) false (next_lf < 0)
126 count (n+1) (next_cr+1) false (next_lf < 0)
128 else if next_lf >= 0 then begin
129 count (n+1) (next_lf+1) (next_cr < 0) false
(* initial call: start at offset 0 with zero lines counted *)
135 count 0 0 false false
(* [tokens_of_xml_pi lexers s] tokenizes the body of an <?xml ...?>
 * processing instruction.  A trailing space is appended to [s] —
 * presumably so the scanner can always terminate the last token; TODO
 * confirm against the scanner definition.
 * NOTE(review): the [collect] loop head and its terminating case are
 * missing from this elided listing. *)
139 let tokens_of_xml_pi lexers s =
140 let lexbuf = Lexing.from_string (s ^ " ") in
142 let t = lexers.scan_xml_pi lexbuf in
145 | _ -> t :: collect()
(* [decode_xml_pi pl] decodes a token list [pl] (as produced by
 * [tokens_of_xml_pi]) into an association list of (name, value) pairs;
 * any token sequence not of the form name = "value" raises [WF_error].
 * NOTE(review): the inner [decode] function head and its base case are
 * missing from this elided listing. *)
151 let decode_xml_pi pl =
152 (* 'pl' must consist of name="value" or name='value' pairs which are returned
154 * The "value" is returned as it is; no substitution of &entities; happens.
158 Pro_name name :: Pro_eq :: Pro_string value :: pl' ->
159 (name, value) :: decode pl'
163 raise (WF_error("Bad XML processing instruction"))
(* [decode_doc_xml_pi pl] interprets the name/value pairs of a *document*
 * XML declaration and returns (version, encoding option, standalone
 * option).  Only the four orderings listed below are legal — "version"
 * must come first, per the XML specification's XMLDecl production; any
 * other combination raises [WF_error].
 * NOTE(review): the [match ... with] head line is missing from this
 * elided listing. *)
169 let decode_doc_xml_pi pl =
171 [ "version", v ] -> (v, None, None)
172 | [ "version", v; "encoding", e ] -> (v, Some e, None)
173 | [ "version", v; "standalone", s ] -> (v, None, Some s)
174 | [ "version", v; "encoding", e; "standalone", s ] -> (v, Some e, Some s)
176 raise(WF_error("Bad XML declaration"))
(* [check_text_xml_pi pl] validates the XML declaration of an *external
 * text entity* (a TextDecl): "encoding" is required, "version" optional,
 * nothing is returned.  Other pair combinations raise [WF_error].
 * NOTE(review): the [match ... with] head line is missing from this
 * elided listing. *)
180 let check_text_xml_pi pl =
182 | [ "version", v; "encoding", e ] -> ()
183 | [ "encoding", e ] -> ()
185 raise(WF_error("Bad XML declaration"))
(* [check_version_num s] checks every character of the version string [s]
 * against the VersionNum production (letters, digits, '-', '_', '.',
 * ':'); an illegal character raises [WF_error].
 * NOTE(review): the [match s.[i] with] line and the wildcard arm are
 * missing from this elided listing. *)
189 let check_version_num s =
190 let l = String.length s in
191 for i = 0 to l - 1 do
193 ('a'..'z'|'A'..'Z'|'0'..'9'|
194 '-'|'_'|'.'|':') -> ()
196 raise(WF_error("Bad XML version string"))
(* [check_public_id s] checks every character of [s] against the PubidChar
 * production (space, CR, LF, letters, digits, and the listed punctuation);
 * an illegal character raises [WF_error].
 * NOTE(review): the [match s.[i] with] line and the wildcard arm are
 * missing from this elided listing. *)
201 let check_public_id s =
202 let l = String.length s in
203 for i = 0 to l - 1 do
205 (' '|'\013'|'\010'|'a'..'z'|'A'..'Z'|'0'..'9'|
206 '-'|'\''|'('|')'|'+'|','|'.'|'/'|':'|'='|'?'|
207 ';'|'!'|'*'|'#'|'@'|'$'|'_'|'%') -> ()
209 raise(WF_error("Illegal character in PUBLIC identifier"))
214 (**********************************************************************)
(* [check_dups l] returns [true] iff the list [l] contains a duplicate
 * element (O(n^2) membership scan — presumably fine for the short lists
 * it is applied to).
 * NOTE(review): the match head, the base case, and the destructuring of
 * [c]/[l'] are missing from this elided listing. *)
218 let rec check_dups l =
222 if List.mem c l' then true else check_dups l'
(* [count pred l] counts how many elements of [l] satisfy [pred]
 * (non-tail-recursive).
 * NOTE(review): the match head, the base case, and the destructuring of
 * [x]/[l'] are missing from this elided listing. *)
226 let rec count pred l =
230 if pred x then 1 + (count pred l') else count pred l'
234 (**********************************************************************)
(* [check_attribute_value_lexically lexerset x t v] verifies that [v]
 * obeys the lexical form required by attribute type [t] (see the table
 * in the original comment below).  [x] is a lazy exception forced and
 * raised on failure, so constructing the error is deferred until needed.
 * NOTE(review): elided listing — the end-of-input case of
 * [get_name_list], the [match t with] head, and the single-Name arms
 * (original lines 261-262, 272-273, 279-280) are missing. *)
237 let check_attribute_value_lexically lexerset x t v =
238 (* raises x if the attribute value v does not match the lexical rules
239 * for attribute type t:
240 * - t = A_id: v must be a <name>
241 * - t = A_idref: v must match <name>
242 * - t = A_idrefs: v must match <names>
243 * - t = A_entity: v must match <name>
244 * - t = A_entities: v must match <names>
245 * - t = A_nmtoken: v must match <nmtoken>
246 * - t = A_nmtokens: v must match <nmtokens>
247 * - t = A_notation _: v must match <name>
248 * - t = A_enum _: v must match <nmtoken>
249 * - t = A_cdata: not checked
(* tokenize [v] with the name-string scanner; [Ignore] tokens
 * (presumably whitespace) are dropped *)
251 let lexbuf = Lexing.from_string v in
252 let rec get_name_list() =
253 match lexerset.scan_name_string lexbuf with
255 | Ignore -> get_name_list()
256 | tok -> tok :: get_name_list()
258 let l = get_name_list() in
(* single-<name> types: the token list must be exactly one Name *)
260 (A_id | A_idref | A_entity | A_notation _) ->
263 | _ -> raise (Lazy.force x)
(* list-of-<name> types: every token must be a Name *)
265 | (A_idrefs | A_entities) ->
266 if List.exists (fun tok ->
(* single-<nmtoken> types: exactly one Name or Nametoken *)
271 | (A_nmtoken | A_enum _) ->
274 | [ Nametoken n ] -> ()
275 | _ -> raise (Lazy.force x)
(* list-of-<nmtoken> types: every token must be Name or Nametoken *)
278 if List.exists (fun tok ->
281 | Nametoken _ -> false
(* [split_attribute_value lexerset v] splits [v] into its whitespace-
 * separated names/nmtokens and returns them as a string list; any other
 * token raises [Validation_error].
 * NOTE(review): the end-of-input case of [get_name_list] (original line
 * 296) and the final call are missing from this elided listing. *)
289 let split_attribute_value lexerset v =
290 (* splits 'v' into a list of names or nmtokens. The white space separating
291 * the names/nmtokens in 'v' is suppressed and not returned.
293 let lexbuf = Lexing.from_string v in
294 let rec get_name_list() =
295 match lexerset.scan_name_string lexbuf with
297 | Ignore -> get_name_list()
298 | Name s -> s :: get_name_list()
299 | Nametoken s -> s :: get_name_list()
300 | _ -> raise(Validation_error("Illegal attribute value"))
(* [normalize_line_separators lexerset s] rescans [s] with the CRLF
 * scanner and reassembles it — presumably replacing CR/CRLF line
 * separators with the normalized form; TODO confirm which token the
 * scanner emits for separators (missing from this elided listing, along
 * with the end-of-input case and the final call). *)
306 let normalize_line_separators lexerset s =
307 let lexbuf = Lexing.from_string s in
308 let rec get_string() =
309 match lexerset.scan_for_crlf lexbuf with
311 | CharData s -> s ^ get_string()
(* [value_of_attribute lexerset dtd n atype v] converts the raw string
 * value [v] of attribute [n] into an [att_value] (Value/Valuelist),
 * checking lexical form, notation/enumeration membership, NDATA entity
 * existence, and the standalone declaration along the way (details in
 * the original comment below).
 * NOTE(review): elided listing — the [match atype with] head, the
 * A_cdata arm, the A_entity/A_entities/A_notation/A_enum arm headers,
 * and several closing expressions are missing; the per-arm comments
 * below are inferred from the visible bodies. *)
318 let value_of_attribute lexerset dtd n atype v =
319 (* The attribute with name 'n', type 'atype' and string value 'v' is
320 * decomposed, and the att_value is returned:
321 * - It is checked whether 'v' conforms to the lexical rules for attributes
323 * - If 'atype <> A_cdata', leading and trailing spaces are removed from 'v'.
324 * - If 'atype = A_notation d', it is checked if 'v' matches one of the
325 * notation names contained in d.
326 * - If 'atype = A_enum d', it is checked whether 'v' matches one of the
328 * - If 'atype' refers to a "single-value" type, the value is returned as
329 * Value u, where u is the normalized value. If 'atype' refers to a
330 * "list" type, the value is returned as Valuelist l, where l contains
333 * Note that this function does not implement all normalization rules.
334 * It is expected that the string passed as 'v' is already preprocessed;
335 * i.e. character and entity references are resolved, and the substitution
336 * of white space characters by space characters has already been performed.
337 * If these requirements are met, the value returned by this function
338 * will be perfectly normalized.
341 * - ENTITY and ENTITIES values: It is checked whether there is an
342 * unparsed general entity
343 * [ Other checks planned: ID, IDREF, IDREFS but not yet implemented ]
(* error construction is deferred via [lazy] so it only costs when the
 * check actually fails *)
346 let lexical_error() =
347 lazy (raise(Validation_error("Attribute `" ^ n ^ "' is lexically malformed"))) in
349 let remove_leading_and_trailing_spaces u =
350 (* Precondition: 'u' matches <name> or <nmtoken> *)
351 match split_attribute_value lexerset u with
(* [check_ndata_entity u]: the entity named [u] must exist, be an NDATA
 * (unparsed) entity, and must not violate a standalone declaration *)
356 let check_ndata_entity u =
357 let en, extdecl = dtd # gen_entity u in (* or Validation_error *)
358 if not (en # is_ndata) then
359 raise(Validation_error("Reference to entity `" ^ u ^
360 "': NDATA entity expected"));
361 if dtd # standalone_declaration && extdecl then
362 raise(Validation_error("Reference to entity `" ^ u ^
363 "' violates standalone declaration"));
(* single-value name/nmtoken types: check lexically, then trim *)
370 | (A_id | A_idref | A_nmtoken) ->
371 check_attribute_value_lexically lexerset (lexical_error()) atype v;
372 Value (remove_leading_and_trailing_spaces v)
(* presumably the A_entity arm: additionally verify the NDATA entity *)
374 check_attribute_value_lexically lexerset (lexical_error()) atype v;
375 let v' = remove_leading_and_trailing_spaces v in
376 check_ndata_entity v';
(* list types: check lexically, then split into tokens *)
379 | (A_idrefs | A_nmtokens) ->
380 check_attribute_value_lexically lexerset (lexical_error()) atype v;
381 Valuelist (split_attribute_value lexerset v)
(* presumably the A_entities arm: verify each NDATA entity in the list *)
384 check_attribute_value_lexically lexerset (lexical_error()) atype v;
385 let l = split_attribute_value lexerset v in
386 List.iter check_ndata_entity l;
(* presumably the A_notation nl arm: the value must name a declared
 * notation *)
390 check_attribute_value_lexically lexerset (lexical_error()) atype v;
391 let v' = remove_leading_and_trailing_spaces v in
392 if not (List.mem v' nl) then
393 raise(Validation_error
395 "' does not match one of the declared notation names"));
(* presumably the A_enum enuml arm: the value must be one of the
 * declared enumerator tokens *)
399 check_attribute_value_lexically lexerset (lexical_error()) atype v;
400 let v' = remove_leading_and_trailing_spaces v in
401 if not (List.mem v' enuml) then
402 raise(Validation_error
404 "' does not match one of the declared enumerator tokens"));
(* [normalization_changes_value lexerset atype v] returns [true] when
 * normalizing [v] under attribute type [atype] would yield a different
 * string (used to detect standalone-declaration violations).
 * NOTE(review): elided listing — the [match atype with] head, the
 * A_cdata arm, and the final comparison of [v'] with [v] are missing. *)
409 let normalization_changes_value lexerset atype v =
411 * - 'atype' is a "single-value" type, and the normalization of the string
412 * value 'v' of this type discards leading and/or trailing spaces
413 * - 'atype' is a "list" type, and the normalization of the string value
414 * 'v' of this type discards leading and/or trailing spaces, or spaces
415 * separating the tokens of the list (i.e. the normal form is that
416 * the tokens are separated by exactly one space character).
418 * Note: It is assumed that TABs, CRs, and LFs in 'v' are already converted
426 | (A_id | A_idref | A_entity | A_nmtoken | A_notation _ | A_enum _) ->
427 (* Return 'true' if the first or last character is a space.
428 * The following check works for both ISO-8859-1 and UTF-8.
430 v <> "" && (v.[0] = ' ' || v.[String.length v - 1] = ' ')
432 | (A_idrefs | A_entities | A_nmtokens) ->
433 (* Split the list, and concatenate the tokens as required by
434 * the normal form. Return 'true' if this operation results in
435 * a different string than 'v'.
436 * This check works for both ISO-8859-1 and UTF-8.
438 let l = split_attribute_value lexerset v in
439 let v' = String.concat " " l in
444 (**********************************************************************)
(* [write_markup_string ~from_enc ~to_enc os s] writes [s] (encoded in the
 * internal representation encoding [from_enc]) to [os] recoded into
 * [to_enc]; when the encodings coincide no recoding happens.  Characters
 * are emitted verbatim — no entity escaping (contrast
 * [write_data_string] below).  Unrepresentable code points apparently
 * raise an error via the substitution callback.
 * NOTE(review): elided listing — the [then]/[else] bodies, the
 * [recode_string] call head, and the exception constructor wrapping the
 * "Cannot represent" message are missing. *)
446 let write_markup_string ~(from_enc:rep_encoding) ~to_enc os s =
447 (* Write the 'from_enc'-encoded string 's' as 'to_enc'-encoded string to
448 * 'os'. All characters are written as they are.
451 if to_enc = (from_enc :> encoding)
454 ~in_enc:(from_enc :> encoding)
458 ("Pxp_aux.write_markup_string: Cannot represent " ^
459 "code point " ^ string_of_int n))
462 write os s' 0 (String.length s')
(* [write_data_string ~from_enc ~to_enc os content] writes character-data
 * [content] to [os], recoding from [from_enc] to [to_enc]; the markup-
 * significant characters '&', '<', '>', '"', '%' and any code point not
 * representable in [to_enc] are written as entity/character references.
 * The loop scans [content], flushing the clean run so far with
 * [write_part] whenever a special character is met, then emitting its
 * escape with [write_ascii].
 * NOTE(review): elided listing — the [write_ascii]/[write_part] function
 * heads, the position ref [i]'s definition, and the loop's bookkeeping
 * lines are missing.  Also, the escape string literals below appear
 * entity-DECODED by whatever produced this dump ("&" instead of the
 * expected "&amp;", and the unbalanced """ instead of "&quot;"); as
 * written they would re-emit the unescaped character.  Restore them
 * from the pristine source — do not trust this listing's literals. *)
466 let write_data_string ~(from_enc:rep_encoding) ~to_enc os content =
467 (* Write the 'from_enc'-encoded string 's' as 'to_enc'-encoded string to
468 * 'os'. The characters '&', '<', '>', '"', '%' and every character that
469 * cannot be represented in 'to_enc' are paraphrased as entity reference
472 let convert_ascii s =
473 (* Convert the ASCII-encoded string 's'. Note that 'from_enc' is
474 * always ASCII-compatible
476 if to_enc = (from_enc :> encoding)
480 ~in_enc:(from_enc :> encoding)
(* ASCII input can always be represented, so substitution is a bug *)
482 ~subst:(fun n -> assert false)
487 (* Write the ASCII-encoded string 's' *)
488 let s' = convert_ascii s in
489 write os s' 0 (String.length s')
493 (* Writes the substring of 'content' beginning at pos 'j' with length 'l'
495 if to_enc = (from_enc :> encoding) then
498 let s' = recode_string
499 ~in_enc:(from_enc :> encoding)
(* unrepresentable code points become numeric character references *)
502 convert_ascii ("&#" ^ string_of_int n ^ ";"))
503 (String.sub content j l)
505 write os s' 0 (String.length s')
510 for k = 0 to String.length content - 1 do
511 match content.[k] with
512 ('&' | '<' | '>' | '"' | '%') as c ->
(* flush the unescaped run preceding the special character *)
514 write_part !i (k - !i);
516 '&' -> write_ascii "&"
517 | '<' -> write_ascii "<"
518 | '>' -> write_ascii ">"
519 | '"' -> write_ascii """
520 | '%' -> write_ascii "%" (* reserved in DTDs *)
(* flush whatever remains after the last special character *)
526 if !i < String.length content then
527 write_part !i (String.length content - !i)
531 (* ======================================================================
535 * Revision 1.1 2000/11/17 09:57:29 lpadovan
538 * Revision 1.6 2000/08/14 22:24:55 gerd
539 * Moved the module Pxp_encoding to the netstring package under
540 * the new name Netconversion.
542 * Revision 1.5 2000/07/25 00:30:01 gerd
543 * Added support for pxp:dtd PI options.
545 * Revision 1.4 2000/07/16 18:31:09 gerd
546 * The exception Illegal_character has been dropped.
548 * Revision 1.3 2000/07/16 16:33:57 gerd
549 * New function write_markup_string: Handles the encoding
552 * Revision 1.2 2000/07/08 22:15:45 gerd
553 * [Merging 0.2.10:] write_data_string: The character '%' is special, too.
555 * Revision 1.1 2000/05/29 23:48:38 gerd
556 * Changed module names:
557 * Markup_aux into Pxp_aux
558 * Markup_codewriter into Pxp_codewriter
559 * Markup_document into Pxp_document
560 * Markup_dtd into Pxp_dtd
561 * Markup_entity into Pxp_entity
562 * Markup_lexer_types into Pxp_lexer_types
563 * Markup_reader into Pxp_reader
564 * Markup_types into Pxp_types
565 * Markup_yacc into Pxp_yacc
566 * See directory "compatibility" for (almost) compatible wrappers emulating
567 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
569 * ======================================================================
570 * Old logs from markup_aux.ml:
572 * Revision 1.12 2000/05/27 19:08:30 gerd
573 * Added functionality to check standalone declaration:
575 * expand_attvalue: Checks whether included entities violate the
576 * stand-alone declaration.
578 * value_of_attribute: Checks whether ENTITY/ENTITIES values violate
579 * this declaration. (Furthermore, it is checked whether the NDATA
580 * entity exists - this has been forgotten in previous versions.)
582 * value_of_attribute/check_attribute_value_lexically: improved.
584 * New function normalization_changes_value: helps detecting
585 * one case which violates the standalone declaration.
587 * Revision 1.11 2000/05/20 20:31:40 gerd
588 * Big change: Added support for various encodings of the
589 * internal representation.
591 * Revision 1.10 2000/05/01 20:41:56 gerd
592 * New function write_data_string.
594 * Revision 1.9 2000/04/30 18:11:31 gerd
595 * New function normalize_line_separators.
596 * In function expand_attvalue: New argument norm_crlf. If the attvalue
597 * is read directly from a file, the sequence CR LF must be converted to a
598 * single space. If the attvalue is read from a replacement text, CR LF has
599 * already converted to a single LF, and CR LF, if still occurring, must be
600 * converted to two spaces. The caller can indicate the case by passing
601 * true/false as norm_crlf.
603 * Revision 1.8 1999/09/01 22:51:07 gerd
605 * 'character' raises Illegal_character if characters are found that
606 * do not match the production Char.
608 * Revision 1.7 1999/09/01 16:17:37 gerd
609 * Added function 'check_name'.
611 * Revision 1.6 1999/08/15 20:33:19 gerd
612 * Added: a function that checks public identifiers. Only certain
613 * characters may occur in these identifiers.
614 * Control characters are rejected by the "character" function.
615 * Bugfix: recursive entity references are detected in attribute
618 * Revision 1.5 1999/08/15 02:18:02 gerd
619 * That '<' is not allowed in attribute values, is a violation
620 * of well-formedness, not of the validity; so WF_error is raised.
622 * Revision 1.4 1999/08/15 00:20:37 gerd
623 * When expanding attribute values, references to parameter
624 * entities are now resolved by the method "replacement_text" which
625 * has an additional return value, and no longer by "attlist_replacement_text".
626 * The new return value indicates whether references to external entities
627 * have been resolved (directly or indirectly); this is allowed at some
628 * locations but not in attribute values.
630 * Revision 1.3 1999/08/14 22:05:53 gerd
631 * Several functions have now a "warner" as argument which is
632 * an object with a "warn" method. This is used to warn about characters
633 * that cannot be represented in the Latin 1 alphabet.
635 * Revision 1.2 1999/08/10 21:35:06 gerd
636 * The XML/encoding declaration at the beginning of entities is
637 * evaluated. In particular, entities have now a method "xml_declaration"
638 * which returns the name/value pairs of such a declaration. The "encoding"
639 * setting is interpreted by the entity itself; "version", and "standalone"
640 * are interpreted by Markup_yacc.parse_document_entity. Other settings
641 * are ignored (this does not conform to the standard; the standard prescribes
642 * that "version" MUST be given in the declaration of document; "standalone"
643 * and "encoding" CAN be declared; no other settings are allowed).
644 * TODO: The user should be warned if the standard is not exactly
645 * fulfilled. -- The "standalone" property is not checked yet.
647 * Revision 1.1 1999/08/10 00:35:50 gerd