Bivio::Type::String
# Copyright (c) 1999-2010 bivio Software, Inc. All rights reserved.
# $Id$
package Bivio::Type::String;
use strict;
use Bivio::Base 'Bivio.Type';
use Text::Tabs ();
# char => [utf-8, windows-1252]
my($_TRANSLITERATE) = {
'-' => [qr{[\x{0096}\x{2010}-\x{2013}]}, qr{\x96}],
'--' => [qr{[\x{0097}\x{2014}-\x{2015}]}, qr{\x97}],
"'" => [qr{[\x{0091}\x{0092}\x{2018}-\x{201B}\x{2BC}\x{2032}]}, qr{[\x91\x92\xb4]}],
'"' => [qr{[\x{0093}\x{0094}\x{201C}-\x{201F}\x{2033}]}, qr{[\x93\x94]}],
'...' => [qr{[\x{0085}\x{2026}]}, qr{\x85}],
'*' => [qr{[\x{0095}\x{2022}\x{20B7}]}, qr{[\x95\xB7]}],
'(TM)' => [qr{[\x{2122}\x{0099}]}, qr{\x99}],
' ' => [qr{\x{00A0}}, qr{\xA0}],
'(C)' => [qr{\x{00A9}}, qr{\xA9}],
'<<' => [qr{\x{00AB}}, qr{\xAB}],
'>>' => [qr{\x{00BB}}, qr{\xBB}],
'<-' => [qr{\x{2190}}],
'->' => [qr{\x{2192}}],
'(R)' => [qr{\x{00AE}}, qr{\xAE}],
'+/-' => [qr{\x{00B1}}, qr{\xB1}],
'1/4' => [qr{\x{00BC}}, qr{\xBC}],
'1/2' => [qr{\x{00BD}}, qr{\xBD}],
'3/4' => [qr{\x{00BE}}, qr{\xBE}],
' ' => [qr{[\x{00A0}\x{2028}\x{2009}]}, qr{\xA0}],
'' => [qr{[\x{00AD}\x{200B}]}, qr{\xAD}],
#TODO: remove these when we support unicode
'A' => [qr{[\x{00C0}-\x{00C5}]}, qr{[\xC0-\xC5]}],
'AE' => [qr{\x00C6}, qr{[\xC6]}],
'C' => [qw{\x00C7}, qr{[\xC7]}],
'E' => [qr{[\x{00C8}-\x{00CB}]}, qr{[\xC8-\xCB]}],
'I' => [qr{[\x{00CC}-\x{00CF}]}, qr{[\xCC-\xCF]}],
'N' => [qr{[\x{00D1}]}, qr{[\xD1]}],
'O' => [qr{[\x{00D2}-\x{00D6}]}, qr{[\xD2-\xD6]}],
'U' => [qr{[\x{00D9}-\x{00DC}]}, qr{[\xD9-\xDC]}],
'Y' => [qr{\x{00DD}}, qr{\xDD}],
'a' => [qr{[\x{00E0}-\x{00E5}]}, qr{[\xE0-\xE5]}],
'ae' => [qr{\x00E6}, qr{\xE6}],
'c' => [qr{\x00E7}, qr{\xE7}],
'e' => [qr{[\x{00E8}-\x{00EB}]}, qr{[\xE8-\xEB]}],
'fi' => [qr{[\x{fb01}]}, qr{[\xDE]}],
'fl' => [qr{[\x{fb02}]}, qr{[\xDF]}],
'i' => [qr{[\x{00EC}-\x{00EF}]}, qr{[\xEC-\xEF]}],
'n' => [qr{[\x{00F1}]}, qr{[\xF1]}],
'o' => [qr{[\x{00F2}-\x{00F6}]}, qr{[\xF2-\xF6]}],
'u' => [qr{[\x{00F9}-\x{00FC}]}, qr{[\xF9-\xFC]}],
'y' => [qr{\x{00FD}}, qr{\xFD}],
};
sub canonicalize_and_excerpt {
my($proto, $value, $max_words, $no_ellipsis) = @_;
my($v, $return) = _ref($value);
return $v
if $return;
# So we are re-entrant. If there was an ellipsis in the actual text, so be it.
$$v =~ s/\s+\.{3}$//;
# remove repeating symbols, ex ---
$$v =~ s/[#\$\%&*+\-.:^_~<=>@~]{3,}/ /g;
$max_words ||= 45;
#TODO: Split on paragraphs first. Google groups seems to do this
my($words) = [grep(
length($_),
split(
' ',
${$proto->canonicalize_charset(
$proto->canonicalize_newlines($v),
)},
$max_words + 1,
),
)];
if (@$words > $max_words) {
pop(@$words);
push(@$words, '...')
unless $no_ellipsis;
}
return \(join(' ', @$words));
}
sub canonicalize_charset {
my(undef, $value) = @_;
my($v, $return) = _ref($value);
return $v
if $return;
return _clean_whitespace(_clean_utf8($v) || _clean_1252($v) || $v);
}
sub canonicalize_newlines {
my(undef, $value) = @_;
my($v, $return) = _ref($value);
return $v
if $return;
$$v =~ s/\r\n|\r/\n/sg;
$$v =~ s/^[ \t]+$//mg;
$$v =~ s/\n+$//sg;
$$v .= "\n"
if length($$v);
return $v;
}
sub clean_and_trim {
my($proto, $value) = @_;
my($v, $return) = _ref($value);
return $v
if $return;
$value .= $value
while length($value) < $proto->get_min_width;
return utf8::is_utf8($value)
? _trim_utf8($proto, $value)
: substr($value, 0, $proto->get_width);
}
sub compare {
my($proto, $left, $right) = @_;
return $proto->compare_defined(
defined($left) ? $left : '',
defined($right) ? $right : '',
);
}
sub from_literal {
my($proto, $value) = @_;
$proto->internal_from_literal_warning
unless wantarray;
return (undef, undef)
unless defined($value) && length($value);
if (my $mw = $proto->get_min_width) {
return (undef, Bivio::TypeError->TOO_SHORT)
if length($value) < $mw;
}
return (undef, Bivio::TypeError->TOO_LONG)
if length($value) > $proto->get_width;
return $value;
}
sub get_min_width {
return 0;
}
sub get_width {
return 0x7fffffff;
}
sub to_camel_case {
return _camel_case($_[1], ' ');
}
sub to_camel_case_identifier {
return _camel_case($_[1], '');
}
sub _camel_case {
my($value, $sep) = @_;
return !$value ? $value
: join($sep, map(ucfirst(lc($_)), split(/[\W_]+/, $value)));
}
sub _clean_1252 {
# See http://en.wikipedia.org/wiki/WINDOWS-1252
my($value) = @_;
my($res) = _map_characters($value, 1);
return undef unless $res;
$$value =~ s/\x0D?\x0A?$//g;
$$value =~ s/[\x00-\x09\x0B-\x1F\x7F\x81]//g;
$$value =~ s/[\xB0\xB7]//g;
$$value =~ s/[\xDE]//g;
return $value;
}
sub _clean_utf8 {
my($value) = @_;
return undef
unless utf8::valid($$value);
utf8::decode($$value);
my($res) = _map_characters($value, 0);
return undef
unless $res;
utf8::encode($$value);
return $value;
}
sub _clean_whitespace {
my($value) = @_;
$$value =~ s/\t/ /sg;
$$value =~ s/\r\n/\n/sg;
$$value =~ s/\r/\n/sg;
$$value =~ s/ +$//mg;
$$value =~ s/^\s*|\s*$//sg;
return $value;
}
sub _map_characters {
my($value, $map) = @_;
my($match) = 0;
while (my($to, $from) = each(%$_TRANSLITERATE)) {
my($regexp) = $from->[$map];
next unless $regexp;
$match = 1
if $$value =~ s/$regexp/$to/g;
}
return $match ? $value : undef;
}
sub _ref {
my($value) = @_;
my($v) = ref($value) ? $value : \$value;
return ($v, 0)
if defined($$v) && length($$v);
$$v = '';
return ($v, 1);
}
sub _size_in_bytes {
my($proto, $value) = @_;
use bytes;
return bytes::length($value);
}
sub _trim_utf8 {
my($proto, $value) = @_;
my($width) = $proto->get_width;
my($current) = $width;
while (_size_in_bytes($proto, $value) > $width) {
$value = substr($value, 0, --$current);
}
return $value;
}
1;