# Copyright (c) 1999-2010 bivio Software, Inc. All rights reserved. # # Visit http://www.bivio.biz for more info. # # This library is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation; either version 2.1 of the # License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; If not, you may get a copy from: # http://www.opensource.org/licenses/lgpl-license.html # # $Id: String.pm,v 2.23 2011/12/06 02:54:48 nagler Exp $ package Bivio::Type::String; use strict; use Bivio::Base 'Bivio.Type'; use Text::Tabs (); # char => [utf-8, windows-1252] my($_TRANSLITERATE) = { '-' => [qr{[\x{0096}\x{2010}-\x{2013}]}, qr{\x96}], '--' => [qr{[\x{2014}-\x{2015}]}, qr{\x97}], "'" => [qr{[\x{2018}-\x{201B}\x{2BC}\x{2032}]}, qr{[\x91\x92\xb4]}], '"' => [qr{[\x{201C}-\x{201F}\x{2033}]}, qr{[\x93\x94]}], '...' => [qr{\x{2026}}, qr{\x85}], '*' => [qr{[\x{2022}\x{20B7}]}, qr{[\x95\xB7]}], '(TM)' => [qr{\x{2122}}, qr{\x99}], ' ' => [qr{\x{00A0}}, qr{\xA0}], '(C)' => [qr{\x{00A9}}, qr{\xA9}], '<<' => [qr{\x{00AB}}, qr{\xAB}], '>>' => [qr{\x{00BB}}, qr{\xBB}], '<-' => [qr{\x{2190}}], '->' => [qr{\x{2192}}], '(R)' => [qr{\x{00AE}}, qr{\xAE}], '+/-' => [qr{\x{00B1}}, qr{\xB1}], '1/4' => [qr{\x{00BC}}, qr{\xBC}], '1/2' => [qr{\x{00BD}}, qr{\xBD}], '3/4' => [qr{\x{00BE}}, qr{\xBE}], ' ' => [qr{[\x{00A0}\x{2028}]}, qr{\xA0}], '' => [qr{[\x{00AD}\x{200B}]}, qr{\xAD}], #TODO: remove these when we support unicode 'A' => [qr{[\x{00C0}-\x{00C5}]}, qr{[\xC0-\xC5]}], 'AE' => [qr{\x00C6}, qr{[\xC6]}], 'C' => [qw{\x00C7}, qr{[\xC7]}], 'E' => [qr{[\x{00C8}-\x{00CB}]}, qr{[\xC8-\xCB]}], 'I' => [qr{[\x{00CC}-\x{00CF}]}, qr{[\xCC-\xCF]}], 'N' => [qr{[\x{00D1}]}, qr{[\xD1]}], 'O' => [qr{[\x{00D2}-\x{00D6}]}, qr{[\xD2-\xD6]}], 'U' => [qr{[\x{00D9}-\x{00DC}]}, qr{[\xD9-\xDC]}], 'Y' => [qr{\x{00DD}}, qr{\xDD}], 'a' => [qr{[\x{00E0}-\x{00E5}]}, qr{[\xE0-\xE5]}], 'ae' => [qr{\x00E6}, qr{\xE6}], 'c' => [qr{\x00E7}, qr{\xE7}], 'e' => [qr{[\x{00E8}-\x{00EB}]}, qr{[\xE8-\xEB]}], 'fi' => [qr{[\x{fb01}]}], 'fl' => [qr{[\x{fb02}]}], 'i' => [qr{[\x{00EC}-\x{00EF}]}, qr{[\xEC-\xEF]}], 'n' => [qr{[\x{00F1}]}, qr{[\xF1]}], 'o' => [qr{[\x{00F2}-\x{00F6}]}, qr{[\xF2-\xF6]}], 'u' => [qr{[\x{00F9}-\x{00FC}]}, qr{[\xF9-\xFC]}], 'y' => [qr{\x{00FD}}, qr{\xFD}], }; our($VERSION) = sprintf('%d.%02d', q$Revision: 2.23 $ =~ /\d+/g); sub canonicalize_and_excerpt { my($proto, $value, $max_words, $no_ellipsis) = @_; my($v, $return) = _ref($value); return $v if $return; # So we are re-entrant. If there was an ellipsis in the actual text, so be it. $$v =~ s/\s+\.{3}$//; $max_words ||= 45; #TODO: Split on paragraphs first. Google groups seems to do this my($words) = [grep( length($_), split( ' ', ${$proto->canonicalize_charset( $proto->canonicalize_newlines($v), )}, $max_words + 1, ), )]; if (@$words > $max_words) { pop(@$words); push(@$words, '...') unless $no_ellipsis; } return \(join(' ', @$words)); } sub canonicalize_charset { my(undef, $value) = @_; my($v, $return) = _ref($value); return $v if $return; return _clean_whitespace(_clean_utf8($v) || _clean_1252($v) || $v); } sub canonicalize_newlines { my(undef, $value) = @_; my($v, $return) = _ref($value); return $v if $return; $$v =~ s/\r\n|\r/\n/sg; $$v =~ s/^[ \t]+$//mg; $$v =~ s/\n+$//sg; $$v .= "\n" if length($$v); return $v; } sub clean_and_trim { my($proto, $value) = @_; my($v, $return) = _ref($value); return $v if $return; $value .= $value while length($value) < $proto->get_min_width; return substr($value, 0, $proto->get_width); } sub compare { my($proto, $left, $right) = @_; return $proto->compare_defined( defined($left) ? $left : '', defined($right) ? $right : '', ); } sub from_literal { my($proto, $value) = @_; $proto->internal_from_literal_warning unless wantarray; return (undef, undef) unless defined($value) && length($value); if (my $mw = $proto->get_min_width) { return (undef, Bivio::TypeError->TOO_SHORT) if length($value) < $mw; } return (undef, Bivio::TypeError->TOO_LONG) if length($value) > $proto->get_width; return $value; } sub get_min_width { return 0; } sub get_width { return 0x7fffffff; } sub to_camel_case { return _camel_case($_[1], ' '); } sub to_camel_case_identifier { return _camel_case($_[1], ''); } sub wrap_lines { my($proto, $value, $width) = @_; $width = 72 unless $width; my(@lines) = (split /\n/, ref($value) ? $$value : $value); @lines = Text::Tabs::expand(@lines); my($formatted) = []; my($indent) = 0; foreach my $line (@lines) { $line =~ s/\s+$//; while (defined($line) && length($line) > $width) { _wrap_line($formatted, \$line, $indent, $width); } push(@$formatted, $line) if defined($line); } return join("\n", @$formatted, ''); } sub _camel_case { my($value, $sep) = @_; return !$value ? $value : join($sep, map(ucfirst(lc($_)), split(/[\W_]+/, $value))); } sub _clean_1252 { # See http://en.wikipedia.org/wiki/WINDOWS-1252 my($value) = @_; my($res) = _map_characters($value, 1); return undef unless $res; $$value =~ s/\x0D?\x0A?$//g; $$value =~ s/[\x00-\x09\x0B-\x1F\x7F\x81]//g; $$value =~ s/[\xB0\xB7]//g; $$value =~ s/[\xDE]//g; return $value; } sub _clean_utf8 { my($value) = @_; return undef unless utf8::valid($$value); utf8::decode($$value); my($res) = _map_characters($value, 0); return undef unless $res; utf8::encode($$value); return $value; } sub _clean_whitespace { my($value) = @_; $$value =~ s/\t/ /sg; $$value =~ s/\r\n/\n/sg; $$value =~ s/\r/\n/sg; $$value =~ s/ +$//mg; $$value =~ s/^\s*|\s*$//sg; return $value; } sub _map_characters { my($value, $map) = @_; my($match) = 0; while (my($to, $from) = each(%$_TRANSLITERATE)) { my($regexp) = $from->[$map]; next unless $regexp; $match = 1 if $$value =~ s/$regexp/$to/g; } return $match ? $value : undef; } sub _ref { my($value) = @_; my($v) = ref($value) ? $value : \$value; return ($v, 0) if defined($$v) && length($$v); $$v = ''; return ($v, 1); } sub _wrap_line { my($formatted, $line, $indent, $width) = @_; $$line =~ /(^\s*(|[\-\*])\s+)/; $indent = defined($1) ? substr($1, 0, $width) : ''; my($white_pos) = rindex($$line, ' ', $width); $white_pos = index($$line, ' ', $width) if $white_pos < length($indent); # Line cannot be broken if no white-space found or quoted if ($white_pos == -1 || $$line =~ /^\s*[>]/) { push(@$formatted, $$line); undef($$line); } else { my($wrapped) = substr($$line, 0, $white_pos); push(@$formatted, $wrapped); $$line = substr($$line, $white_pos); $$line =~ s/^\s+/' ' x length($indent)/e; } return; } 1;