Bivio::Agent::HTTP::Form
# Copyright (c) 1999-2010 bivio Software, Inc. All rights reserved. # $Id$ package Bivio::Agent::HTTP::Form; use strict; use Bivio::Base 'Bivio.UNIVERSAL'; b_use('IO.Trace'); # C<Bivio::Agent::HTTP::Form> parses an incoming form. # The request must have a I<form_model> attribute. Handles both # C<application/x-www-form-urlencoded> and C<multipart/form-data> # (RFC 1867). # # A form is a hash_ref. The name of the field is the key. The # value is either a scalar or a hash_ref. A string is returned # in the "normal" case, i.e. non-file fields. A hash_ref is returned # in the file field case or with forms which contain file fields # (see FormModel::_parse_cols for handling). This is tightly coupled with # L<Bivio::Type::FileField|Bivio::Type::FileField>. The hash_ref # contains the attributes: name, content_type, filename, and content. our($_TRACE); # Taken from RFC1521. NOT the same as 822_ATOM, btw. my($_TOKEN) = '([^][()<>@,;:\\\\"/?=\\000-\\040\\177-\\377]+)'; # This is the same as Mail::RFC822::QUOTED_STRING, except # we parse out the surrounding quotes. #my($_QUOTED_STRING) = '"((?:(?:\\\\{2})+|\\\\[^\\\\]|[^\\\\"])*)"'; my($_TOO_LONG) = b_use('Bivio.TypeError')->TOO_LONG; my($_FORM_DATA_MULTIPART_MIXED) = b_use('Bivio.TypeError')->FORM_DATA_MULTIPART_MIXED; my($_HTML) = b_use('Bivio.HTML'); my($_JSON) = b_use('MIME.JSON'); sub parse { my(undef, $req, $options) = @_; my($r) = $req->get('r'); my($m) = lc($r->method); unless ($m eq 'post') { return undef unless $m eq 'get'; my($q) = $req->unsafe_get('query'); return undef unless $q && $q->{$req->FORM_IN_QUERY_FLAG}; $req->put(query => {}); return $q; } my($ct) = lc($r->header_in('content-type') || ''); $ct =~ s/;.*//; $ct =~ s/\s//g; foreach my $x ( [qr{^\s*application/x-www-form-urlencoded}, \&_parse_url], [qr{^\s*multipart/form-data}, \&_parse_multipart], [qr{^\s*application/json}, \&_parse_json], ) { _trace('content-type=', $ct) if $_TRACE; next unless $ct =~ $x->[0]; my($res) = $x->[1]->($req, $r, $options); $res->{b_use('Biz.FormModel')->CONTENT_TYPE_FIELD} = $ct; return $res; } b_warn($ct, ': unknown Content-Type'); return undef; } sub _err { my($req, $msg, $entity) = @_; $req->throw_die(CORRUPT_FORM => { message => $msg, entity => $entity, }); } sub _parse_json { my($req) = @_; $req->put_req_is_json; return $_JSON->from_text($req->get_content); } sub _parse_multipart { my($req, $r, $options) = @_; # Returns the parsed multipart/form-data. See RFC1867 for a spec. my($max_field_size) = ($options || {})->{max_field_size} || ($req->unsafe_get('form_model') || b_use('Biz.FormModel')) ->MAX_FIELD_SIZE; my($buf) = $req->get_content; # We destroy content so we have to clear it here. $req->delete('content'); _trace('length=', length($$buf)) if $_TRACE; _err($req, 'no starting boundary line', undef) unless $$buf =~ s/^(?:.*?\r\n)*?(--.*?)\r\n//s; my($boundary) = "\r\n$1"; _trace('boundary=', $boundary) if $_TRACE; my($form) = {}; while (1) { my($field) = _parse_multipart_headers($buf, $req); _err($req, 'missing form boundary: ' . $boundary, $buf) unless (my $i = index($$buf, $boundary)) >= 0; my($content) = substr($$buf, 0, $i); substr($$buf, 0, $i + length($boundary)) = ''; my($value) = keys(%$field) > 1 ? { %$field, $field->{error} ? () : (content => \$content), } : length($content) > $max_field_size ? { %$field, error => $_TOO_LONG } : $content; my($name) = $field->{name}; if (defined($form->{$name})) { $form->{$name} = [$form->{$name}] unless ref($form->{$name}) eq 'ARRAY'; push(@{$form->{$name}}, $value); } else { $form->{$name} = $value; } next if $$buf =~ s/^\r\n//s; last if $$buf =~ s/^--//s; _err($req, 'invalid encapsulation or closing boundary', $buf); } return $form; } sub _parse_multipart_headers { my($buf, $req) = @_; $req->throw_die(CORRUPT_FORM => { message => 'missing header separator', entity => $buf, }) unless $$buf =~ s/^(.*?)\r\n\r\n//s; my($headers) = $1; $headers =~ s/\r\n\s/ /sg; my($field) = {}; foreach my $header (split(/\r\n/, $headers)) { my($key, $value) = split(/:\s*/s, $header, 2); $key = lc($key); _trace($key, ': ', $value) if $_TRACE; if ($key eq 'content-type') { # LIMITATION: We don't handle multipart/mixed. Browsers may use # this to send multiple files for a single field. if ($value =~ /multipart\/mixed/i) { $field->{error} = $_FORM_DATA_MULTIPART_MIXED; next; } $field->{content_type} = $value; } elsif ($key eq 'content-disposition') { _err($req, 'invalid content-disposition', $value) unless $value =~ s/^form-data\s*//s; while ($value =~ s/^;\s*$_TOKEN\s*=\s*//os) { my($attr) = lc($1); # According to RFC822 all quotes and backslashes must be # escaped (quoted-pair) and other characters can be to. # The following handles what IE and NS do: they don't # escape, so values come through like filename="y".txt" # (the quote after the y should be escaped). We only # unescape backslash and quote, because the browsers pass # \ without escaping and therefore we can't just do # s/\\(.)/$1/g; _err($req, $attr . ': invalid content-disposition attribute syntax', $value) unless $value =~ s/^\"(.*?)\"\s*;\s*/;/s || $value =~ s/^\"(.*)\"\s*$//s || $value =~ s/^$_TOKEN\s*//os; my($v) = $1; $v =~ s/\\\\/\\/g; $v =~ s/\\\"/\"/g; $field->{$attr} = $v; } } elsif ($key eq 'content-transfer-encoding') { # Really shouldn't get here, but just in case, so we # don't corrupt user data. _err($req, 'invalid encoding must be 8bit or binary', $value) unless $value =~ /^(?:8bit|binary)\b/i; } elsif ($key ne 'content-length') { b_warn($key, ': unexpected header field; headers=', $headers); } } _err($req, 'field missing "name" attribute', $field) unless defined($field->{name}); return $field; } sub _parse_url { my($req) = @_; return $_HTML->parse_www_form_urlencoded(${$req->get_content}); } 1;