2 * ----------------------------------------------------------------------
6 module Str = Netstring_str;;
8 let cr_or_lf_re = Str.regexp "[\013\n]";;
10 let trim_right_spaces_re =
11 Str.regexp "[ \t]+$";;
13 let trim_left_spaces_re =
14 Str.regexp "^[ \t]+";;
17 Str.regexp "\\([^ \t\r\n:]+\\):\\([ \t]*.*\n\\([ \t].*\n\\)*\\)";;
20 Str.regexp "\013?\n";;
22 let end_of_header_re =
23 Str.regexp "\n\013?\n";;
26 let scan_header ?(unfold=true) parstr ~start_pos:i0 ~end_pos:i1 =
27 let rec parse_header i l =
28 match Str.string_partial_match header_re parstr i with
30 let i' = Str.match_end r in
32 failwith "Mimestring.scan_header";
33 let name = String.lowercase(Str.matched_group r 1 parstr) in
35 Str.matched_group r 2 parstr in
38 let value_with_rspaces =
39 Str.global_replace cr_or_lf_re "" value_with_crlf in
40 let value_with_lspaces =
41 Str.global_replace trim_right_spaces_re "" value_with_rspaces in
42 Str.global_replace trim_left_spaces_re "" value_with_lspaces
46 parse_header i' ( (name,value) :: l)
48 (* The header must end with an empty line *)
49 begin match Str.string_partial_match empty_line_re parstr i with
51 List.rev l, Str.match_end r'
53 failwith "Mimestring.scan_header"
61 | EncodedWord of (string * string * string)
65 | DomainLiteral of string
73 | Recognize_encoded_words
76 type s_extended_token =
80 token_linepos : int; (* Position of the beginning of the line *)
82 mutable token_sep : bool; (* separates adjacent encoded words *)
86 let get_token et = et.token;;
87 let get_pos et = et.token_pos;;
88 let get_line et = et.token_line;;
89 let get_column et = et.token_pos - et.token_linepos;;
90 let get_length et = et.token_len;;
91 let separates_adjacent_encoded_words et = et.token_sep;;
93 let get_decoded_word et =
97 | Control c -> String.make 1 c
98 | Special c -> String.make 1 c
99 | DomainLiteral s -> s
101 | EncodedWord (_, encoding, content) ->
102 ( match encoding with
104 Netencoding.Q.decode content
106 Netencoding.Base64.decode
110 | _ -> failwith "get_decoded_word"
113 failwith "get_decoded_word"
118 EncodedWord (charset, _, _) -> charset
119 | End -> failwith "get_charset"
124 { (* What the user specifies: *)
125 scanner_specials : char list;
126 scanner_options : s_option list;
127 (* Derived from that: *)
128 mutable opt_no_backslash_escaping : bool;
129 mutable opt_return_comments : bool;
130 mutable opt_recognize_encoded_words : bool;
132 mutable is_special : bool array;
133 mutable space_is_special : bool;
137 type scanner_target =
138 { scanned_string : string;
139 mutable scanner_pos : int;
140 mutable scanner_line : int;
141 mutable scanner_linepos : int;
142 (* Position of the beginning of the line *)
143 mutable scanned_tokens : s_extended_token Queue.t;
144 (* A queue of already scanned tokens in order to look ahead *)
145 mutable last_token : s_token;
146 (* The last returned token. It is only important whether it is
147 * EncodedWord or not.
152 type mime_scanner = scanner_spec * scanner_target
155 let get_pos_of_scanner (spec, target) =
156 if spec.opt_recognize_encoded_words then
157 failwith "get_pos_of_scanner"
162 let get_line_of_scanner (spec, target) =
163 if spec.opt_recognize_encoded_words then
164 failwith "get_line_of_scanner"
169 let get_column_of_scanner (spec, target) =
170 if spec.opt_recognize_encoded_words then
171 failwith "get_column_of_scanner"
173 target.scanner_pos - target.scanner_linepos
176 let create_mime_scanner ~specials ~scan_options =
177 let is_spcl = Array.create 256 false in
179 (fun c -> is_spcl.( Char.code c ) <- true)
182 { scanner_specials = specials;
183 scanner_options = scan_options;
184 opt_no_backslash_escaping =
185 List.mem No_backslash_escaping scan_options;
186 opt_return_comments =
187 List.mem Return_comments scan_options;
188 opt_recognize_encoded_words =
189 List.mem Recognize_encoded_words scan_options;
190 is_special = is_spcl;
191 space_is_special = is_spcl.(32);
194 (* Grab the remaining arguments: *)
195 fun ?(pos=0) ?(line=1) ?(column=0) s ->
197 { scanned_string = s;
200 scanner_linepos = pos - column;
201 scanned_tokens = Queue.create();
202 last_token = Comment; (* Must not be initialized with EncodedWord *)
209 let encoded_word_re =
210 Str.regexp "=\\?\\([^?]+\\)\\?\\([^?]+\\)\\?\\([^?]+\\)\\?=";;
212 let scan_next_token ((spec,target) as scn) =
215 token_pos = target.scanner_pos;
216 token_line = target.scanner_line;
217 token_linepos = target.scanner_linepos;
224 (* Note: mk_pair creates a new token pair, and it assumes that
225 * target.scanner_pos (and also scanner_line and scanner_linepos)
226 * still contain the position of the beginning of the token.
229 let s = target.scanned_string in
230 let l = String.length s in
234 if spec.is_special.( Char.code c ) then begin
235 let pair = mk_pair (Special c) 1 in
236 target.scanner_pos <- target.scanner_pos + 1;
239 target.scanner_line <- target.scanner_line + 1;
240 target.scanner_linepos <- target.scanner_pos;
249 scan_qstring (i+1) (i+1) 0
252 let i', line, linepos =
253 scan_comment (i+1) 0 target.scanner_line target.scanner_linepos
256 target.scanner_pos <- i';
257 target.scanner_line <- line;
258 target.scanner_linepos <- linepos
260 if spec.opt_return_comments then begin
261 let pair = mk_pair Comment (i' - i) in
266 if spec.space_is_special then begin
267 let pair = mk_pair (Special ' ') (i' - i) in
276 (* Ignore whitespace by default: *)
277 target.scanner_pos <- target.scanner_pos + 1;
280 (* Ignore whitespace by default: *)
281 target.scanner_pos <- target.scanner_pos + 1;
282 target.scanner_line <- target.scanner_line + 1;
283 target.scanner_linepos <- target.scanner_pos;
285 | ('\000'..'\031'|'\127'..'\255') ->
286 let pair = mk_pair (Control c) 1 in
287 target.scanner_pos <- target.scanner_pos + 1;
290 (* Domain literal: *)
291 scan_dliteral (i+1) (i+1) 0
300 let astring = String.sub s i0 (i-i0) in
302 if spec.opt_recognize_encoded_words then
303 Str.string_match ~groups:4 encoded_word_re astring 0
309 (* An atom contains never a linefeed character, so we can ignore
312 let pair = mk_pair (Atom astring) (i-i0) in
313 target.scanner_pos <- i;
316 (* Found an encoded word. *)
317 let charset = Str.matched_group mr 1 astring in
318 let encoding = Str.matched_group mr 2 astring in
319 let content = Str.matched_group mr 3 astring in
320 let t = EncodedWord(String.uppercase charset,
321 String.uppercase encoding,
323 let pair = mk_pair t (i-i0) in
324 target.scanner_pos <- i;
331 ('\000'..'\031'|'\127'..'\255'|'"'|'('|'['|' '|'\t'|'\r'|'\n') ->
334 if spec.is_special.( Char.code c ) then
341 and scan_qstring i0 i n =
346 (* Regular end of the quoted string: *)
347 let content, line, linepos = copy_qstring i0 (i-1) n in
348 let pair = mk_pair (QString content) (i-i0+2) in
349 target.scanner_pos <- i+1;
350 target.scanner_line <- line;
351 target.scanner_linepos <- linepos;
353 | '\\' when not spec.opt_no_backslash_escaping ->
354 scan_qstring i0 (i+2) (n+1)
356 scan_qstring i0 (i+1) (n+1)
358 (* Missing right double quote *)
359 let content, line, linepos = copy_qstring i0 (l-1) n in
360 let pair = mk_pair (QString content) (l-i0+1) in
361 target.scanner_pos <- l;
362 target.scanner_line <- line;
363 target.scanner_linepos <- linepos;
366 and copy_qstring i0 i1 n =
367 (* Used for quoted strings and for domain literals *)
368 let r = String.create n in
370 let line = ref target.scanner_line in
371 let linepos = ref target.scanner_linepos in
375 '\\' when i < i1 && not spec.opt_no_backslash_escaping -> ()
388 and scan_dliteral i0 i n =
393 (* Regular end of the domain literal: *)
394 let content, line, linepos = copy_qstring i0 (i-1) n in
395 let pair = mk_pair (DomainLiteral content) (i-i0+2) in
396 target.scanner_pos <- i+1;
397 target.scanner_line <- line;
398 target.scanner_linepos <- linepos;
400 | '\\' when not spec.opt_no_backslash_escaping ->
401 scan_dliteral i0 (i+2) (n+1)
403 (* Note: '[' is not allowed by RFC 822; we treat it here as
404 * a regular character (questionable)
406 scan_dliteral i0 (i+1) (n+1)
408 (* Missing right bracket *)
409 let content, line, linepos = copy_qstring i0 (l-1) n in
410 let pair = mk_pair (DomainLiteral content) (l-i0+1) in
411 target.scanner_pos <- l;
412 target.scanner_line <- line;
413 target.scanner_linepos <- linepos;
417 and scan_comment i level line linepos =
425 let i', line', linepos' =
426 scan_comment (i+1) (level+1) line linepos
428 scan_comment i' level line' linepos'
429 | '\\' when not spec.opt_no_backslash_escaping ->
430 if (i+1) < l && s.[i+1] = '\n' then
431 scan_comment (i+2) level (line+1) (i+2)
433 scan_comment (i+2) level line linepos
435 scan_comment (i+1) level (line+1) (i+1)
437 scan_comment (i+1) level line linepos
439 (* Missing closing ')' *)
443 scan target.scanner_pos
447 let scan_token ((spec,target) as scn) =
448 (* This function handles token queueing in order to recognize white space
449 * that separates adjacent encoded words.
452 let rec collect_whitespace () =
453 (* Scans whitespace tokens and returns them as:
454 * (ws_list, other_tok) if there is some other_tok following the
455 * list (other_tok = End is possible)
457 let (et, t) as pair = scan_next_token scn in
459 (Special ' '|Special '\t'|Special '\n'|Special '\r') ->
460 let ws_list, tok = collect_whitespace() in
468 (* Is there an already scanned token in the queue? *)
469 let et = Queue.take target.scanned_tokens in
471 target.last_token <- t;
475 (* If not: inspect the last token. If that token is an EncodedWord,
476 * the next tokens are scanned in advance to determine if there
477 * are spaces separating two EncodedWords. These tokens are put
478 * into the queue such that it is avoided that they are scanned
479 * twice. (The sole purpose of the queue.)
481 match target.last_token with
482 EncodedWord(_,_,_) as ew ->
483 let ws_list, tok = collect_whitespace() in
484 (* If tok is an EncodedWord, too, the tokens in ws_list must
485 * be flagged as separating two adjacent encoded words.
488 _, EncodedWord(_,_,_) ->
491 et.token_sep <- true)
496 (* Anyway, queue the read tokens but the first up *)
499 (* Nothing to queue *)
501 target.last_token <- t;
503 | (et,t) as pair :: ws_list' ->
506 Queue.add et' target.scanned_tokens)
512 Queue.add et' target.scanned_tokens
514 (* Return the first scanned token *)
515 target.last_token <- t;
519 (* Regular case: Scan one token; do not queue it up *)
520 let (et, t) as pair = scan_next_token scn in
521 target.last_token <- t;
526 let scan_token_list scn =
528 match scan_token scn with
538 let scan_structured_value s specials options =
539 let rec collect scn =
540 match scan_token scn with
546 let scn = create_mime_scanner specials options s in
551 let specials_rfc822 =
552 [ '<'; '>'; '@'; ','; ';'; ':'; '\\'; '.' ];;
555 let specials_rfc2045 =
556 [ '<'; '>'; '@'; ','; ';'; ':'; '\\'; '/' ];;
559 let scan_encoded_text_value s =
560 let specials = [ ' '; '\t'; '\r'; '\n'; '('; '['; '"' ] in
561 let options = [ Recognize_encoded_words ] in
562 let scn = create_mime_scanner specials options s in
565 match scan_token scn with
568 | et, _ when separates_adjacent_encoded_words et ->
570 | et, (Special _|Atom _|EncodedWord(_,_,_)) ->
579 let scan_value_with_parameters s options =
580 let rec parse_params tl =
582 Atom n :: Special '=' :: Atom v :: tl' ->
583 (n,v) :: parse_rest tl'
584 | Atom n :: Special '=' :: QString v :: tl' ->
585 (n,v) :: parse_rest tl'
587 failwith "Mimestring.scan_value_with_parameters"
591 | Special ';' :: tl' ->
594 failwith "Mimestring.scan_value_with_parameters"
597 (* Note: Even if not used here, the comma is a very common separator
598 * and should be recognized as being special. You will get a
599 * failure if there is a comma in the scanned string.
601 let tl = scan_structured_value s [ ';'; '='; ',' ] options in
604 | [ QString n ] -> n, []
605 | Atom n :: Special ';' :: tl' ->
607 | QString n :: Special ';' :: tl' ->
610 failwith "Mimestring.scan_value_with_parameters"
614 let scan_mime_type s options =
615 let n, params = scan_value_with_parameters s options in
616 (String.lowercase n),
617 (List.map (fun (n,v) -> (String.lowercase n, v)) params)
621 let lf_re = Str.regexp "[\n]";;
623 let scan_multipart_body s ~start_pos:i0 ~end_pos:i1 ~boundary =
624 let l_s = String.length s in
625 if i0 < 0 or i1 < 0 or i0 > l_s or i1 >l_s then
626 invalid_arg "Mimestring.scan_multipart_body";
628 (* First compile the regexps scanning for 'boundary': *)
630 Str.regexp ("\n--" ^ Str.quote boundary) in
632 Str.regexp ("--" ^ Str.quote boundary) in
635 (* i: Beginning of the current part (position directly after the
638 (* Search for next boundary at position i *)
640 try min (fst (Str.search_forward boundary1_re s i) + 1) i1
644 (* i': Either the position of the first '-' of the boundary line,
645 * or i1 if no boundary has been found
648 [] (* Ignore everything after the last boundary *)
651 try min (fst (Str.search_forward lf_re s i') + 1) i1
655 (* i'': The position after the boundary line *)
657 print_int i; print_newline();
658 print_int i'; print_newline();
659 print_int i''; print_newline();
662 let header, k = scan_header s i i' in
663 (* header: the header of the part
664 * k: beginning of the body
668 (* We know that i'-1 is a linefeed character. i'-2 should be a CR
669 * character. Both characters are not part of the value.
673 '\013' -> String.sub s k (i'-2-k)
674 | _ -> String.sub s k (i'-1-k)
676 String.sub s k (i'-1-k)
689 (* Find the first boundary. This is a special case, because it may be
690 * right at the beginning of the string (no preceding CRLF)
694 if Str.string_partial_match boundary2_re s i0 <> None then
697 try min (fst (Str.search_forward boundary1_re s i0)) i1
706 try min (fst (Str.search_forward lf_re s (i_bnd + 1)) + 1) i1
717 let scan_multipart_body_and_decode s ~start_pos:i0 ~end_pos:i1 ~boundary =
718 let parts = scan_multipart_body s i0 i1 boundary in
720 (fun (params, value) ->
722 try List.assoc "content-transfer-encoding" params
723 with Not_found -> "7bit"
726 (* NOTE: In the case of "base64" and "quoted-printable", the allocation
727 * of the string "value" could be avoided.
732 ("7bit"|"8bit"|"binary") -> value
734 Netencoding.Base64.decode_substring
735 value 0 (String.length value) false true
736 | "quoted-printable" ->
737 Netencoding.QuotedPrintable.decode_substring
738 value 0 (String.length value)
740 failwith "Mimestring.scan_multipart_body_and_decode: Unknown content-transfer-encoding"
748 let scan_multipart_body_from_netstream s ~boundary ~create ~add ~stop =
750 (* The block size of s must be at least the length of the boundary + 3.
751 * Otherwise it is not guaranteed that the boundary is always recognized.
753 if Netstream.block_size s < String.length boundary + 3 then
754 invalid_arg "Mimestring.scan_multipart_body_from_netstream";
756 (* First compile the regexps scanning for 'boundary': *)
758 Str.regexp ("\n--" ^ Str.quote boundary) in
760 Str.regexp ("--" ^ Str.quote boundary) in
762 (* Subtask 1: Search the end of the MIME header: CR LF CR LF
763 * (or LF LF). Enlarge the window until the complete header
764 * is covered by the window.
766 let rec search_end_of_header k =
767 (* Search the end of the header beginning at position k of the
769 * Return the position of the first character of the body.
772 (* Search for LF CR? LF: *)
773 let i, r = Str.search_forward
775 (Netbuffer.unsafe_buffer (Netstream.window s))
778 (* If match_end <= window_length, the search was successful.
779 * Otherwise, we searched in the uninitialized region of the
782 if Str.match_end r <= Netstream.window_length s then
788 (* If the end of the stream is reached, the end of the header
790 * Otherwise, we try to read another block, and continue.
792 if Netstream.at_eos s then
793 failwith "Mimestring.scan_multipart_body_from_netstream: Unexpected end of stream";
794 let w0 = Netstream.window_length s in
795 Netstream.want_another_block s;
796 search_end_of_header (max (w0 - 2) 0)
799 (* Subtask 2: Search the first boundary line. *)
800 let rec search_first_boundary() =
801 (* Search boundary per regexp; return the position of the character
802 * immediately following the boundary (on the same line), or
806 (* Search boundary per regexp: *)
807 let i, r = Str.search_forward
809 (Netbuffer.unsafe_buffer (Netstream.window s))
812 (* If match_end <= window_length, the search was successful.
813 * Otherwise, we searched in the uninitialized region of the
816 if Str.match_end r <= Netstream.window_length s then begin
822 if Netstream.at_eos s then raise Not_found;
823 (* The regexp did not match: Move the window by one block.
827 (Netstream.window_length s)
828 (Netstream.block_size s)
831 search_first_boundary()
834 (* Subtask 3: Search the next boundary line. Invoke 'add' for every
837 let rec search_next_boundary p =
838 (* Returns the position directly after the boundary on the same line *)
840 (* Search boundary per regexp: *)
841 let i,r = Str.search_forward
843 (Netbuffer.unsafe_buffer (Netstream.window s))
846 (* If match_end <= window_length, the search was successful.
847 * Otherwise, we searched in the uninitialized region of the
850 if Str.match_end r <= Netstream.window_length s then begin
851 (* Add the last chunk of the part. *)
853 (* i is a LF. i - 1 should be CR. Ignore these characters. *)
855 match (Netbuffer.unsafe_buffer (Netstream.window s)).[ i - 1 ] with
861 (* Printf.printf "add n=%d\n" n; *)
868 if Netstream.at_eos s then
869 failwith "Mimestring.scan_multipart_body_from_netstream: next MIME boundary not found";
870 (* The regexp did not match: Add the first block of the window;
871 * and move the window.
875 (Netstream.window_length s)
876 (Netstream.block_size s)
878 (* Printf.printf "add n=%d\n" n; *)
881 search_next_boundary p
884 (* Subtask 4: Search the end of the boundary line *)
885 let rec search_end_of_line k =
886 (* Search LF beginning at position k. Discard any contents until that. *)
888 let i,r = Str.search_forward
890 (Netbuffer.unsafe_buffer (Netstream.window s))
893 (* If match_end <= window_length, the search was successful.
894 * Otherwise, we searched in the uninitialized region of the
897 if Str.match_end r <= Netstream.window_length s then begin
903 if Netstream.at_eos s then
904 failwith "Mimestring.scan_multipart_body_from_netstream: MIME boundary without line end";
905 (* The regexp did not match: move the window.
907 let n = Netstream.window_length s in
912 (* Subtask 5: Check whether "--" follows the boundary on the same line *)
913 let check_whether_last_boundary k =
914 (* k: The position directly after the boundary. *)
915 Netstream.want s (k+2);
916 let str = Netbuffer.unsafe_buffer (Netstream.window s) in
917 (Netstream.window_length s >= k+2) && str.[k] = '-' && str.[k+1] = '-'
920 (* Subtask 6: Check whether the buffer begins with a boundary. *)
921 let check_beginning_is_boundary () =
922 let m = String.length boundary + 2 in
924 let str = Netbuffer.unsafe_buffer (Netstream.window s) in
925 (Netstream.window_length s >= m) &&
926 (Str.string_partial_match boundary2_re str 0 <> None)
929 let rec parse_part () =
930 (* The first byte of the current window of s contains the character
931 * directly following the boundary line that starts this part.
933 (* Search the end of the MIME header: *)
934 let k_eoh = search_end_of_header 0 in
935 (* Printf.printf "k_eoh=%d\n" k_eoh; *)
936 (* Get the MIME header: *)
937 let str = Netbuffer.unsafe_buffer (Netstream.window s) in
938 let header, k_eoh' = scan_header str 0 k_eoh in
939 assert (k_eoh = k_eoh');
940 (* Move the window over the header: *)
941 Netstream.move s k_eoh;
942 (* Create the part: *)
943 let p = create header in
946 (* Search the next boundary; add the chunks while searching: *)
947 let k_eob = search_next_boundary p in
948 (* Printf.printf "k_eob=%d\n" k_eob; *)
949 (* Is this the last boundary? *)
950 if check_whether_last_boundary k_eob then begin
952 while not (Netstream.at_eos s) do
953 Netstream.move s (Netstream.window_length s)
955 Netstream.move s (Netstream.window_length s);
959 (* Move to the beginning of the next line: *)
960 let k_eol = search_end_of_line k_eob in
961 Netstream.move s k_eol;
966 (try stop p with _ -> ());
971 (* Continue with next part: *)
975 (* Check whether s directly begins with a boundary: *)
976 if check_beginning_is_boundary() then begin
977 (* Move to the beginning of the next line: *)
978 let k_eol = search_end_of_line 0 in
979 Netstream.move s k_eol;
980 (* Begin with first part: *)
984 (* Search the first boundary: *)
986 let k_eob = search_first_boundary() in
987 (* Printf.printf "k_eob=%d\n" k_eob; *)
988 (* Move to the beginning of the next line: *)
989 let k_eol = search_end_of_line k_eob in
990 (* Printf.printf "k_eol=%d\n" k_eol; *)
991 Netstream.move s k_eol;
992 (* Begin with first part: *)
996 (* No boundary at all: The body is empty. *)
1002 (* ======================================================================
1006 * Revision 1.1 2000/11/17 09:57:27 lpadovan
1009 * Revision 1.8 2000/08/13 00:04:36 gerd
1010 * Encoded_word -> EncodedWord
1013 * Revision 1.7 2000/08/07 00:25:14 gerd
1014 * Implemented the new functions for structured field lexing.
1016 * Revision 1.6 2000/06/25 22:34:43 gerd
1017 * Added labels to arguments.
1019 * Revision 1.5 2000/06/25 21:15:48 gerd
1020 * Checked thread-safety.
1022 * Revision 1.4 2000/05/16 22:30:14 gerd
1023 * Added support for some types of malformed MIME messages.
1025 * Revision 1.3 2000/04/15 13:09:01 gerd
1026 * Implemented uploads to temporary files.
1028 * Revision 1.2 2000/03/02 01:15:30 gerd
1031 * Revision 1.1 2000/02/25 15:21:12 gerd