helm/DEVEL/pxp/netstring/mimestring.ml

   1 (* $Id$
   2  * ----------------------------------------------------------------------
   3  *
   4  *)
   5
   6 module Str = Netstring_str;;
   7
   8 let cr_or_lf_re = Str.regexp "[\013\n]";;
   9
  10 let trim_right_spaces_re =
  11   Str.regexp "[ \t]+$";;
  12
  13 let trim_left_spaces_re =
  14   Str.regexp "^[ \t]+";;
  15
  16 let header_re =
  17   Str.regexp "\\([^ \t\r\n:]+\\):\\([ \t]*.*\n\\([ \t].*\n\\)*\\)";;
  18
  19 let empty_line_re =
  20   Str.regexp "\013?\n";;
  21
  22 let end_of_header_re =
  23   Str.regexp "\n\013?\n";;
  24
  25
  26 let scan_header ?(unfold=true) parstr ~start_pos:i0 ~end_pos:i1 =
  27   let rec parse_header i l =
  28     match Str.string_partial_match header_re parstr i with
  29         Some r ->
  30           let i' = Str.match_end r in
  31           if i' > i1 then
  32             failwith "Mimestring.scan_header";
  33           let name = String.lowercase(Str.matched_group r 1 parstr) in
  34           let value_with_crlf =
  35             Str.matched_group r 2 parstr in
  36           let value =
  37             if unfold then begin
  38               let value_with_rspaces =
  39                 Str.global_replace cr_or_lf_re "" value_with_crlf in
  40               let value_with_lspaces =
  41                 Str.global_replace trim_right_spaces_re "" value_with_rspaces in
  42               Str.global_replace trim_left_spaces_re "" value_with_lspaces
  43             end
  44             else value_with_crlf
  45           in
  46           parse_header i' ( (name,value) :: l)
  47       | None ->
  48           (* The header must end with an empty line *)
  49           begin match Str.string_partial_match empty_line_re parstr i with
  50               Some r' ->
  51                 List.rev l, Str.match_end r'
  52             | None ->
  53                 failwith "Mimestring.scan_header"
  54           end
  55   in
  56   parse_header i0 []
  57 ;;
  58
  59 type s_token =
  60     Atom of string
  61   | EncodedWord of (string * string * string)
  62   | QString of string
  63   | Control of char
  64   | Special of char
  65   | DomainLiteral of string
  66   | Comment
  67   | End
  68 ;;
  69
  70 type s_option =
  71     No_backslash_escaping
  72   | Return_comments
  73   | Recognize_encoded_words
  74 ;;
  75
  76 type s_extended_token =
  77     { token      : s_token;
  78       token_pos  : int;
  79       token_line : int;
  80       token_linepos : int;   (* Position of the beginning of the line *)
  81       token_len  : int;
  82       mutable token_sep : bool; (* separates adjacent encoded words *)
  83     }
  84 ;;
  85
  86 let get_token et  = et.token;;
  87 let get_pos et    = et.token_pos;;
  88 let get_line et   = et.token_line;;
  89 let get_column et = et.token_pos - et.token_linepos;;
  90 let get_length et = et.token_len;;
  91 let separates_adjacent_encoded_words et = et.token_sep;;
  92
  93 let get_decoded_word et =
  94   match et.token with
  95       Atom s -> s
  96     | QString s -> s
  97     | Control c -> String.make 1 c
  98     | Special c -> String.make 1 c
  99     | DomainLiteral s -> s
 100     | Comment -> ""
 101     | EncodedWord (_, encoding, content) ->
 102         ( match encoding with
 103               ("Q"|"q") ->
 104                 Netencoding.Q.decode content
 105             | ("B"|"b") ->
 106                 Netencoding.Base64.decode
 107                   ~url_variant:false
 108                   ~accept_spaces:false
 109                   content
 110             | _ -> failwith "get_decoded_word"
 111         )
 112     | End ->
 113         failwith "get_decoded_word"
 114 ;;
 115
 116 let get_charset et =
 117   match et.token with
 118       EncodedWord (charset, _, _) -> charset
 119     | End -> failwith "get_charset"
 120     | _ -> "US-ASCII"
 121 ;;
 122
 123 type scanner_spec =
 124     { (* What the user specifies: *)
 125       scanner_specials : char list;
 126       scanner_options : s_option list;
 127       (* Derived from that: *)
 128       mutable opt_no_backslash_escaping : bool;
 129       mutable opt_return_comments : bool;
 130       mutable opt_recognize_encoded_words : bool;
 131
 132       mutable is_special : bool array;
 133       mutable space_is_special : bool;
 134     }
 135 ;;
 136
 137 type scanner_target =
 138     { scanned_string : string;
 139       mutable scanner_pos : int;
 140       mutable scanner_line : int;
 141       mutable scanner_linepos : int;
 142       (* Position of the beginning of the line *)
 143       mutable scanned_tokens : s_extended_token Queue.t;
 144       (* A queue of already scanned tokens in order to look ahead *)
 145       mutable last_token : s_token;
 146       (* The last returned token. It is only important whether it is
 147        * EncodedWord or not.
 148        *)
 149     }
 150 ;;
 151
 152 type mime_scanner = scanner_spec * scanner_target
 153 ;;
 154
 155 let get_pos_of_scanner (spec, target) =
 156   if spec.opt_recognize_encoded_words then
 157     failwith "get_pos_of_scanner"
 158   else
 159     target.scanner_pos
 160 ;;
 161
 162 let get_line_of_scanner (spec, target) =
 163   if spec.opt_recognize_encoded_words then
 164     failwith "get_line_of_scanner"
 165   else
 166     target.scanner_line
 167 ;;
 168
 169 let get_column_of_scanner (spec, target) =
 170   if spec.opt_recognize_encoded_words then
 171     failwith "get_column_of_scanner"
 172   else
 173     target.scanner_pos - target.scanner_linepos
 174 ;;
 175
 176 let create_mime_scanner ~specials ~scan_options =
 177   let is_spcl = Array.create 256 false in
 178   List.iter
 179     (fun c -> is_spcl.( Char.code c ) <- true)
 180     specials;
 181   let spec =
 182     { scanner_specials = specials;
 183       scanner_options = scan_options;
 184       opt_no_backslash_escaping =
 185         List.mem No_backslash_escaping scan_options;
 186       opt_return_comments =
 187         List.mem Return_comments scan_options;
 188       opt_recognize_encoded_words =
 189         List.mem Recognize_encoded_words scan_options;
 190       is_special = is_spcl;
 191       space_is_special = is_spcl.(32);
 192     }
 193   in
 194   (* Grab the remaining arguments: *)
 195   fun ?(pos=0) ?(line=1) ?(column=0) s ->
 196     let target =
 197       { scanned_string = s;
 198         scanner_pos = pos;
 199         scanner_line = line;
 200         scanner_linepos = pos - column;
 201         scanned_tokens = Queue.create();
 202         last_token = Comment;   (* Must not be initialized with EncodedWord *)
 203       }
 204     in
 205     spec, target
 206 ;;
 207
 208
 209 let encoded_word_re =
 210   Str.regexp "=\\?\\([^?]+\\)\\?\\([^?]+\\)\\?\\([^?]+\\)\\?=";;
 211
 212 let scan_next_token ((spec,target) as scn) =
 213   let mk_pair t len =
 214     { token = t;
 215       token_pos = target.scanner_pos;
 216       token_line = target.scanner_line;
 217       token_linepos = target.scanner_linepos;
 218       token_len = len;
 219       token_sep = false;
 220     },
 221     t
 222   in
 223
 224   (* Note: mk_pair creates a new token pair, and it assumes that
 225    * target.scanner_pos (and also scanner_line and scanner_linepos)
 226    * still contain the position of the beginning of the token.
 227    *)
 228
 229   let s = target.scanned_string in
 230   let l = String.length s in
 231   let rec scan i =
 232     if i < l then begin
 233       let c = s.[i] in
 234       if spec.is_special.( Char.code c ) then begin
 235         let pair = mk_pair (Special c) 1 in
 236         target.scanner_pos <- target.scanner_pos + 1;
 237         (match c with
 238              '\n' ->
 239                target.scanner_line    <- target.scanner_line + 1;
 240                target.scanner_linepos <- target.scanner_pos;
 241            | _ -> ()
 242         );
 243         pair
 244       end
 245       else
 246         match c with
 247             '"' ->
 248               (* Quoted string: *)
 249               scan_qstring (i+1) (i+1) 0
 250           | '(' ->
 251               (* Comment: *)
 252               let i', line, linepos =
 253                 scan_comment (i+1) 0 target.scanner_line target.scanner_linepos
 254               in
 255               let advance() =
 256                 target.scanner_pos <- i';
 257                 target.scanner_line <- line;
 258                 target.scanner_linepos <- linepos
 259               in
 260               if spec.opt_return_comments then begin
 261                 let pair = mk_pair Comment (i' - i) in
 262                 advance();
 263                 pair
 264               end
 265               else
 266                 if spec.space_is_special then begin
 267                   let pair = mk_pair (Special ' ') (i' - i) in
 268                   advance();
 269                   pair
 270                 end
 271                 else begin
 272                   advance();
 273                   scan i'
 274                 end
 275           | (' '|'\t'|'\r') ->
 276               (* Ignore whitespace by default: *)
 277               target.scanner_pos <- target.scanner_pos + 1;
 278               scan (i+1)
 279           | '\n' ->
 280               (* Ignore whitespace by default: *)
 281               target.scanner_pos     <- target.scanner_pos + 1;
 282               target.scanner_line    <- target.scanner_line + 1;
 283               target.scanner_linepos <- target.scanner_pos;
 284               scan (i+1)
 285           | ('\000'..'\031'|'\127'..'\255') ->
 286               let pair = mk_pair (Control c) 1 in
 287               target.scanner_pos <- target.scanner_pos + 1;
 288               pair
 289           | '[' ->
 290               (* Domain literal: *)
 291               scan_dliteral (i+1) (i+1) 0
 292           | _ ->
 293               scan_atom i i
 294     end
 295     else
 296       mk_pair End 0
 297
 298   and scan_atom i0 i =
 299     let return_atom() =
 300       let astring = String.sub s i0 (i-i0) in
 301       let r =
 302         if spec.opt_recognize_encoded_words then
 303           Str.string_match ~groups:4 encoded_word_re astring 0
 304         else
 305           None
 306       in
 307       match r with
 308           None ->
 309             (* An atom contains never a linefeed character, so we can ignore
 310              * scanner_line here.
 311              *)
 312             let pair = mk_pair (Atom astring) (i-i0) in
 313             target.scanner_pos <- i;
 314             pair
 315         | Some mr ->
 316             (* Found an encoded word. *)
 317             let charset  = Str.matched_group mr 1 astring in
 318             let encoding = Str.matched_group mr 2 astring in
 319             let content  = Str.matched_group mr 3 astring in
 320             let t = EncodedWord(String.uppercase charset,
 321                                  String.uppercase encoding,
 322                                  content) in
 323             let pair = mk_pair t (i-i0) in
 324             target.scanner_pos <- i;
 325             pair
 326     in
 327
 328     if i < l then
 329       let c = s.[i] in
 330       match c with
 331           ('\000'..'\031'|'\127'..'\255'|'"'|'('|'['|' '|'\t'|'\r'|'\n') ->
 332             return_atom()
 333         | _ ->
 334             if spec.is_special.( Char.code c ) then
 335               return_atom()
 336             else
 337               scan_atom i0 (i+1)
 338     else
 339       return_atom()
 340
 341   and scan_qstring i0 i n =
 342     if i < l then
 343       let c = s.[i] in
 344       match c with
 345           '"' ->
 346             (* Regular end of the quoted string: *)
 347             let content, line, linepos = copy_qstring i0 (i-1) n in
 348             let pair = mk_pair (QString content) (i-i0+2) in
 349             target.scanner_pos <- i+1;
 350             target.scanner_line <- line;
 351             target.scanner_linepos <- linepos;
 352             pair
 353         | '\\' when not spec.opt_no_backslash_escaping ->
 354             scan_qstring i0 (i+2) (n+1)
 355         | _ ->
 356             scan_qstring i0 (i+1) (n+1)
 357     else
 358       (* Missing right double quote *)
 359       let content, line, linepos = copy_qstring i0 (l-1) n in
 360       let pair = mk_pair (QString content) (l-i0+1) in
 361       target.scanner_pos <- l;
 362       target.scanner_line <- line;
 363       target.scanner_linepos <- linepos;
 364       pair
 365
 366   and copy_qstring i0 i1 n =
 367     (* Used for quoted strings and for domain literals *)
 368     let r = String.create n in
 369     let k = ref 0 in
 370     let line = ref target.scanner_line in
 371     let linepos = ref target.scanner_linepos in
 372     for i = i0 to i1 do
 373       let c = s.[i] in
 374       match c with
 375           '\\' when i < i1 &&  not spec.opt_no_backslash_escaping -> ()
 376         | '\n' ->
 377             line := !line + 1;
 378             linepos := i+1;
 379             r.[ !k ] <- c;
 380             incr k
 381         | _ ->
 382             r.[ !k ] <- c;
 383             incr k
 384     done;
 385     assert (!k = n);
 386     r, !line, !linepos
 387
 388   and scan_dliteral i0 i n =
 389     if i < l then
 390       let c = s.[i] in
 391       match c with
 392           ']' ->
 393             (* Regular end of the domain literal: *)
 394             let content, line, linepos = copy_qstring i0 (i-1) n in
 395             let pair = mk_pair (DomainLiteral content) (i-i0+2) in
 396             target.scanner_pos <- i+1;
 397             target.scanner_line <- line;
 398             target.scanner_linepos <- linepos;
 399             pair
 400         | '\\' when not spec.opt_no_backslash_escaping ->
 401             scan_dliteral i0 (i+2) (n+1)
 402         | _ ->
 403             (* Note: '[' is not allowed by RFC 822; we treat it here as
 404              * a regular character (questionable)
 405              *)
 406             scan_dliteral i0 (i+1) (n+1)
 407     else
 408       (* Missing right bracket *)
 409       let content, line, linepos = copy_qstring i0 (l-1) n in
 410       let pair = mk_pair (DomainLiteral content) (l-i0+1) in
 411       target.scanner_pos <- l;
 412       target.scanner_line <- line;
 413       target.scanner_linepos <- linepos;
 414       pair
 415
 416
 417   and scan_comment i level line linepos =
 418     if i < l then
 419       let c = s.[i] in
 420       match c with
 421           ')' ->
 422             (i+1), line, linepos
 423         | '(' ->
 424             (* nested comment *)
 425             let i', line', linepos' =
 426               scan_comment (i+1) (level+1) line linepos
 427             in
 428             scan_comment i' level line' linepos'
 429         | '\\' when not spec.opt_no_backslash_escaping ->
 430             if (i+1) < l && s.[i+1] = '\n' then
 431               scan_comment (i+2) level (line+1) (i+2)
 432             else
 433               scan_comment (i+2) level line linepos
 434         | '\n' ->
 435             scan_comment (i+1) level (line+1) (i+1)
 436         | _ ->
 437             scan_comment (i+1) level line linepos
 438     else
 439       (* Missing closing ')' *)
 440       i, line, linepos
 441   in
 442
 443   scan target.scanner_pos
 444 ;;
 445
 446
 447 let scan_token ((spec,target) as scn) =
 448   (* This function handles token queueing in order to recognize white space
 449    * that separates adjacent encoded words.
 450    *)
 451
 452   let rec collect_whitespace () =
 453     (* Scans whitespace tokens and returns them as:
 454      * (ws_list, other_tok)     if there is some other_tok following the
 455      *                          list (other_tok = End is possible)
 456      *)
 457     let (et, t) as pair = scan_next_token scn in
 458     ( match t with
 459           (Special ' '|Special '\t'|Special '\n'|Special '\r') ->
 460             let ws_list, tok = collect_whitespace() in
 461             pair :: ws_list, tok
 462         | _ ->
 463             [], pair
 464     )
 465   in
 466
 467   try
 468     (* Is there an already scanned token in the queue? *)
 469     let et = Queue.take target.scanned_tokens in
 470     let t = et.token in
 471     target.last_token <- t;
 472     et, et.token
 473   with
 474       Queue.Empty ->
 475         (* If not: inspect the last token. If that token is an EncodedWord,
 476          * the next tokens are scanned in advance to determine if there
 477          * are spaces separating two EncodedWords. These tokens are put
 478          * into the queue such that it is avoided that they are scanned
 479          * twice. (The sole purpose of the queue.)
 480          *)
 481         match target.last_token with
 482             EncodedWord(_,_,_) as ew ->
 483               let ws_list, tok = collect_whitespace() in
 484               (* If tok is an EncodedWord, too, the tokens in ws_list must
 485                * be flagged as separating two adjacent encoded words.
 486                *)
 487               ( match tok with
 488                     _, EncodedWord(_,_,_) ->
 489                       List.iter
 490                         (fun (et,t) ->
 491                            et.token_sep <- true)
 492                         ws_list
 493                   | _ ->
 494                       ()
 495               );
 496               (* Anyway, queue the read tokens but the first up *)
 497               ( match ws_list with
 498                     [] ->
 499                       (* Nothing to queue *)
 500                       let et, t = tok in
 501                       target.last_token <- t;
 502                       tok
 503                   | (et,t) as pair :: ws_list' ->
 504                       List.iter
 505                         (fun (et',_) ->
 506                            Queue.add et' target.scanned_tokens)
 507                         ws_list';
 508                       ( match tok with
 509                           | _, End ->
 510                               ()
 511                           | (et',_) ->
 512                               Queue.add et' target.scanned_tokens
 513                       );
 514                       (* Return the first scanned token *)
 515                       target.last_token <- t;
 516                       pair
 517               )
 518           | _ ->
 519               (* Regular case: Scan one token; do not queue it up *)
 520               let (et, t) as pair = scan_next_token scn in
 521               target.last_token <- t;
 522               pair
 523 ;;
 524
 525
 526 let scan_token_list scn =
 527   let rec collect() =
 528     match scan_token scn with
 529         _, End ->
 530           []
 531       | pair ->
 532           pair :: collect()
 533   in
 534   collect()
 535 ;;
 536
 537
 538 let scan_structured_value s specials options =
 539   let rec collect scn =
 540     match scan_token scn with
 541         _, End ->
 542           []
 543       | _, t ->
 544           t :: collect scn
 545   in
 546   let scn = create_mime_scanner specials options s in
 547   collect scn
 548 ;;
 549
 550
 551 let specials_rfc822 =
 552   [ '<'; '>'; '@'; ','; ';'; ':'; '\\'; '.' ];;
 553
 554
 555 let specials_rfc2045 =
 556   [ '<'; '>'; '@'; ','; ';'; ':'; '\\'; '/' ];;
 557
 558
 559 let scan_encoded_text_value s =
 560   let specials = [ ' '; '\t'; '\r'; '\n'; '('; '['; '"' ] in
 561   let options =  [ Recognize_encoded_words ] in
 562   let scn = create_mime_scanner specials options s in
 563
 564   let rec collect () =
 565     match scan_token scn with
 566         _, End ->
 567           []
 568       | et, _ when separates_adjacent_encoded_words et ->
 569           collect()
 570       | et, (Special _|Atom _|EncodedWord(_,_,_)) ->
 571           et :: collect ()
 572       | _, _ ->
 573           assert false
 574   in
 575   collect()
 576 ;;
 577
 578
 579 let scan_value_with_parameters s options =
 580   let rec parse_params tl =
 581     match tl with
 582         Atom n :: Special '=' :: Atom v :: tl' ->
 583           (n,v) :: parse_rest tl'
 584       | Atom n :: Special '=' :: QString v :: tl' ->
 585           (n,v) :: parse_rest tl'
 586       | _ ->
 587           failwith "Mimestring.scan_value_with_parameters"
 588   and parse_rest tl =
 589     match tl with
 590         [] -> []
 591       | Special ';' :: tl' ->
 592           parse_params tl'
 593       | _ ->
 594           failwith "Mimestring.scan_value_with_parameters"
 595   in
 596
 597   (* Note: Even if not used here, the comma is a very common separator
 598    * and should be recognized as being special. You will get a
 599    * failure if there is a comma in the scanned string.
 600    *)
 601   let tl = scan_structured_value s [ ';'; '='; ',' ] options in
 602   match tl with
 603       [ Atom n ] -> n, []
 604     | [ QString n ] -> n, []
 605     | Atom n :: Special ';' :: tl' ->
 606         n, parse_params tl'
 607     | QString n :: Special ';' :: tl' ->
 608         n, parse_params tl'
 609     | _ ->
 610         failwith "Mimestring.scan_value_with_parameters"
 611 ;;
 612
 613
 614 let scan_mime_type s options =
 615   let n, params = scan_value_with_parameters s options in
 616   (String.lowercase n),
 617   (List.map (fun (n,v) -> (String.lowercase n, v)) params)
 618 ;;
 619
 620
 621 let lf_re = Str.regexp "[\n]";;
 622
 623 let scan_multipart_body s ~start_pos:i0 ~end_pos:i1 ~boundary =
 624   let l_s = String.length s in
 625   if i0 < 0 or i1 < 0 or i0 > l_s or i1 >l_s then
 626     invalid_arg "Mimestring.scan_multipart_body";
 627
 628   (* First compile the regexps scanning for 'boundary': *)
 629   let boundary1_re =
 630     Str.regexp ("\n--" ^ Str.quote boundary) in
 631   let boundary2_re =
 632     Str.regexp ("--" ^ Str.quote boundary) in
 633
 634   let rec parse i =
 635     (* i: Beginning of the current part (position directly after the
 636      * boundary line
 637      *)
 638     (* Search for next boundary at position i *)
 639     let i' =
 640       try min (fst (Str.search_forward boundary1_re s i) + 1) i1
 641       with
 642           Not_found -> i1
 643     in
 644     (* i': Either the position of the first '-' of the boundary line,
 645      *     or i1 if no boundary has been found
 646      *)
 647     if i' >= i1 then
 648       [] (* Ignore everything after the last boundary *)
 649     else
 650       let i'' =
 651         try min (fst (Str.search_forward lf_re s i') + 1) i1
 652         with
 653             Not_found -> i1
 654       in
 655       (* i'': The position after the boundary line *)
 656 (*
 657       print_int i; print_newline();
 658       print_int i'; print_newline();
 659       print_int i''; print_newline();
 660       flush stdout;
 661 *)
 662       let header, k = scan_header s i i' in
 663       (* header: the header of the part
 664        * k: beginning of the body
 665        *)
 666
 667       let value =
 668         (* We know that i'-1 is a linefeed character. i'-2 should be a CR
 669          * character. Both characters are not part of the value.
 670          *)
 671         if i' >= 2 then
 672           match s.[i'-2] with
 673               '\013' -> String.sub s k (i'-2-k)
 674             | _      -> String.sub s k (i'-1-k)
 675         else
 676           String.sub s k (i'-1-k)
 677       in
 678
 679       let pair =
 680         (header, value) in
 681
 682       if i'' >= i1
 683       then
 684         [ pair ]
 685       else
 686         pair :: parse i''
 687   in
 688
 689   (* Find the first boundary. This is a special case, because it may be
 690    * right at the beginning of the string (no preceding CRLF)
 691    *)
 692
 693   let i_bnd =
 694     if Str.string_partial_match boundary2_re s i0 <> None then
 695       i0
 696     else
 697       try min (fst (Str.search_forward boundary1_re s i0)) i1
 698       with
 699           Not_found -> i1
 700   in
 701
 702   if i_bnd >= i1 then
 703     []
 704   else
 705     let i_bnd' =
 706       try min (fst (Str.search_forward lf_re s (i_bnd + 1)) + 1) i1
 707       with
 708           Not_found -> i1
 709     in
 710     if i_bnd' >= i1 then
 711       []
 712     else
 713       parse i_bnd'
 714 ;;
 715
 716
 717 let scan_multipart_body_and_decode s ~start_pos:i0 ~end_pos:i1 ~boundary =
 718   let parts = scan_multipart_body s i0 i1 boundary in
 719   List.map
 720     (fun (params, value) ->
 721        let encoding =
 722          try List.assoc "content-transfer-encoding" params
 723          with Not_found -> "7bit"
 724        in
 725
 726        (* NOTE: In the case of "base64" and "quoted-printable", the allocation
 727         * of the string "value" could be avoided.
 728         *)
 729
 730        let value' =
 731          match encoding with
 732              ("7bit"|"8bit"|"binary") -> value
 733            | "base64" ->
 734                Netencoding.Base64.decode_substring
 735                  value 0 (String.length value) false true
 736            | "quoted-printable" ->
 737                Netencoding.QuotedPrintable.decode_substring
 738                  value 0 (String.length value)
 739            | _ ->
 740                failwith "Mimestring.scan_multipart_body_and_decode: Unknown content-transfer-encoding"
 741        in
 742        (params, value')
 743     )
 744     parts
 745 ;;
 746
 747
 748 let scan_multipart_body_from_netstream s ~boundary ~create ~add ~stop =
 749
 750   (* The block size of s must be at least the length of the boundary + 3.
 751    * Otherwise it is not guaranteed that the boundary is always recognized.
 752    *)
 753   if Netstream.block_size s < String.length boundary + 3 then
 754     invalid_arg "Mimestring.scan_multipart_body_from_netstream";
 755
 756   (* First compile the regexps scanning for 'boundary': *)
 757   let boundary1_re =
 758     Str.regexp ("\n--" ^ Str.quote boundary) in
 759   let boundary2_re =
 760     Str.regexp ("--" ^ Str.quote boundary) in
 761
 762   (* Subtask 1: Search the end of the MIME header: CR LF CR LF
 763    *            (or LF LF). Enlarge the window until the complete header
 764    *            is covered by the window.
 765    *)
 766   let rec search_end_of_header k =
 767     (* Search the end of the header beginning at position k of the
 768      * current window.
 769      * Return the position of the first character of the body.
 770      *)
 771     try
 772       (* Search for LF CR? LF: *)
 773       let i, r = Str.search_forward
 774                    end_of_header_re
 775                    (Netbuffer.unsafe_buffer (Netstream.window s))
 776                    k
 777       in
 778       (* If match_end <= window_length, the search was successful.
 779        * Otherwise, we searched in the uninitialized region of the
 780        * buffer.
 781        *)
 782       if Str.match_end r <= Netstream.window_length s then
 783         Str.match_end r
 784       else
 785         raise Not_found
 786     with
 787         Not_found ->
 788           (* If the end of the stream is reached, the end of the header
 789            * is missing: Error.
 790            * Otherwise, we try to read another block, and continue.
 791            *)
 792           if Netstream.at_eos s then
 793             failwith "Mimestring.scan_multipart_body_from_netstream: Unexpected end of stream";
 794           let w0 = Netstream.window_length s in
 795           Netstream.want_another_block s;
 796           search_end_of_header (max (w0 - 2) 0)
 797   in
 798
 799   (* Subtask 2: Search the first boundary line. *)
 800   let rec search_first_boundary() =
 801     (* Search boundary per regexp; return the position of the character
 802      * immediately following the boundary (on the same line), or
 803      * raise Not_found.
 804      *)
 805     try
 806       (* Search boundary per regexp: *)
 807       let i, r = Str.search_forward
 808                    boundary1_re
 809                    (Netbuffer.unsafe_buffer (Netstream.window s))
 810                    0
 811       in
 812       (* If match_end <= window_length, the search was successful.
 813        * Otherwise, we searched in the uninitialized region of the
 814        * buffer.
 815        *)
 816       if Str.match_end r <= Netstream.window_length s then begin
 817         Str.match_end r
 818       end
 819       else raise Not_found
 820     with
 821         Not_found ->
 822           if Netstream.at_eos s then raise Not_found;
 823           (* The regexp did not match: Move the window by one block.
 824            *)
 825           let n =
 826             min
 827               (Netstream.window_length s)
 828               (Netstream.block_size s)
 829           in
 830           Netstream.move s n;
 831           search_first_boundary()
 832   in
 833
 834   (* Subtask 3: Search the next boundary line. Invoke 'add' for every
 835    * read chunk
 836    *)
 837   let rec search_next_boundary p =
 838     (* Returns the position directly after the boundary on the same line *)
 839     try
 840       (* Search boundary per regexp: *)
 841       let i,r = Str.search_forward
 842                   boundary1_re
 843                   (Netbuffer.unsafe_buffer (Netstream.window s))
 844                   0
 845       in
 846       (* If match_end <= window_length, the search was successful.
 847        * Otherwise, we searched in the uninitialized region of the
 848        * buffer.
 849        *)
 850       if Str.match_end r <= Netstream.window_length s then begin
 851         (* Add the last chunk of the part. *)
 852         let n =
 853           (* i is a LF. i - 1 should be CR. Ignore these characters. *)
 854           if i >= 1 then
 855             match (Netbuffer.unsafe_buffer (Netstream.window s)).[ i - 1 ] with
 856                 '\013' -> i - 1
 857               | _      -> i
 858           else
 859             i
 860         in
 861         (* Printf.printf "add n=%d\n" n; *)
 862         add p s 0 n;
 863         Str.match_end r
 864       end
 865       else raise Not_found
 866     with
 867         Not_found ->
 868           if Netstream.at_eos s then
 869             failwith "Mimestring.scan_multipart_body_from_netstream: next MIME boundary not found";
 870           (* The regexp did not match: Add the first block of the window;
 871            * and move the window.
 872            *)
 873           let n =
 874             min
 875               (Netstream.window_length s)
 876               (Netstream.block_size s)
 877           in
 878           (* Printf.printf "add n=%d\n" n; *)
 879           add p s 0 n;
 880           Netstream.move s n;
 881           search_next_boundary p
 882   in
 883
 884   (* Subtask 4: Search the end of the boundary line *)
 885   let rec search_end_of_line k =
 886     (* Search LF beginning at position k. Discard any contents until that. *)
 887     try
 888       let i,r = Str.search_forward
 889                   lf_re
 890                   (Netbuffer.unsafe_buffer (Netstream.window s))
 891                   k
 892       in
 893       (* If match_end <= window_length, the search was successful.
 894        * Otherwise, we searched in the uninitialized region of the
 895        * buffer.
 896        *)
 897       if Str.match_end r <= Netstream.window_length s then begin
 898          Str.match_end r
 899       end
 900       else raise Not_found
 901     with
 902         Not_found ->
 903           if Netstream.at_eos s then
 904             failwith "Mimestring.scan_multipart_body_from_netstream: MIME boundary without line end";
 905           (* The regexp did not match: move the window.
 906            *)
 907           let n = Netstream.window_length s in
 908           Netstream.move s n;
 909           search_end_of_line 0
 910   in
 911
 912   (* Subtask 5: Check whether "--" follows the boundary on the same line *)
 913   let check_whether_last_boundary k =
 914     (* k: The position directly after the boundary. *)
 915     Netstream.want s (k+2);
 916     let str = Netbuffer.unsafe_buffer (Netstream.window s) in
 917     (Netstream.window_length s >= k+2) && str.[k] = '-' && str.[k+1] = '-'
 918   in
 919
 920   (* Subtask 6: Check whether the buffer begins with a boundary. *)
 921   let check_beginning_is_boundary () =
 922     let m = String.length boundary + 2 in
 923     Netstream.want s m;
 924     let str = Netbuffer.unsafe_buffer (Netstream.window s) in
 925     (Netstream.window_length s >= m) &&
 926     (Str.string_partial_match boundary2_re str 0 <> None)
 927   in
 928
 929   let rec parse_part () =
 930     (* The first byte of the current window of s contains the character
 931      * directly following the boundary line that starts this part.
 932      *)
 933     (* Search the end of the MIME header: *)
 934     let k_eoh = search_end_of_header 0 in
 935     (* Printf.printf "k_eoh=%d\n" k_eoh; *)
 936     (* Get the MIME header: *)
 937     let str = Netbuffer.unsafe_buffer (Netstream.window s) in
 938     let header, k_eoh' = scan_header str 0 k_eoh in
 939     assert (k_eoh = k_eoh');
 940     (* Move the window over the header: *)
 941     Netstream.move s k_eoh;
 942     (* Create the part: *)
 943     let p = create header in
 944     let continue =
 945       begin try
 946         (* Search the next boundary; add the chunks while searching: *)
 947         let k_eob = search_next_boundary p in
 948         (* Printf.printf "k_eob=%d\n" k_eob; *)
 949         (* Is this the last boundary? *)
 950         if check_whether_last_boundary k_eob then begin
 951           (* Skip the rest: *)
 952           while not (Netstream.at_eos s) do
 953             Netstream.move s (Netstream.window_length s)
 954           done;
 955           Netstream.move s (Netstream.window_length s);
 956           false
 957         end
 958         else begin
 959           (* Move to the beginning of the next line: *)
 960           let k_eol = search_end_of_line k_eob in
 961           Netstream.move s k_eol;
 962           true
 963         end
 964       with
 965           any ->
 966             (try stop p with _ -> ());
 967             raise any
 968       end in
 969       stop p;
 970       if continue then
 971         (* Continue with next part: *)
 972         parse_part()
 973   in
 974
 975   (* Check whether s directly begins with a boundary: *)
 976   if check_beginning_is_boundary() then begin
 977     (* Move to the beginning of the next line: *)
 978     let k_eol = search_end_of_line 0 in
 979     Netstream.move s k_eol;
 980     (* Begin with first part: *)
 981     parse_part()
 982   end
 983   else begin
 984     (* Search the first boundary: *)
 985     try
 986       let k_eob = search_first_boundary() in
 987       (* Printf.printf "k_eob=%d\n" k_eob; *)
 988       (* Move to the beginning of the next line: *)
 989       let k_eol = search_end_of_line k_eob in
 990       (* Printf.printf "k_eol=%d\n" k_eol; *)
 991       Netstream.move s k_eol;
 992       (* Begin with first part: *)
 993       parse_part()
 994     with
 995         Not_found ->
 996           (* No boundary at all: The body is empty. *)
 997           ()
 998   end;
 999 ;;
1000
1001
1002 (* ======================================================================
1003  * History:
1004  *
1005  * $Log$
1006  * Revision 1.1  2000/11/17 09:57:27  lpadovan
1007  * Initial revision
1008  *
1009  * Revision 1.8  2000/08/13 00:04:36  gerd
1010  *      Encoded_word -> EncodedWord
1011  *      Bugfixes.
1012  *
1013  * Revision 1.7  2000/08/07 00:25:14  gerd
1014  *      Implemented the new functions for structured field lexing.
1015  *
1016  * Revision 1.6  2000/06/25 22:34:43  gerd
1017  *      Added labels to arguments.
1018  *
1019  * Revision 1.5  2000/06/25 21:15:48  gerd
1020  *      Checked thread-safety.
1021  *
1022  * Revision 1.4  2000/05/16 22:30:14  gerd
1023  *      Added support for some types of malformed MIME messages.
1024  *
1025  * Revision 1.3  2000/04/15 13:09:01  gerd
1026  *      Implemented uploads to temporary files.
1027  *
1028  * Revision 1.2  2000/03/02 01:15:30  gerd
1029  *      Updated.
1030  *
1031  * Revision 1.1  2000/02/25 15:21:12  gerd
1032  *      Initial revision.
1033  *
1034  *
1035  *)