# Copyright (c) 2002 bivio Software, Inc.  All Rights Reserved.
#
# Visit http://www.bivio.biz for more info.
#
# This library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation; either version 2.1 of the
# License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library;  If not, you may get a copy from:
# http://www.opensource.org/licenses/lgpl-license.html
#
# $Id: Cleaner.pm,v 2.7 2010/04/13 23:26:54 moeller Exp $
package Bivio::Test::HTMLParser::Cleaner;
use strict;
use Bivio::Base 'Test.HTMLParser';

# Normalizes HTML handed over by a parser object and offers helpers that
# turn HTML-escaped fragments into plain, whitespace-normalized text.

our($VERSION) = sprintf('%d.%02d', q$Revision: 2.7 $ =~ /\d+/g);

# NOTE(review): register is inherited; presumably it announces this class
# to the Test.HTMLParser framework -- confirm against the base class.
__PACKAGE__->register;

# internal_new(proto, @args) : instance
#
# Forwards all arguments to new() and returns its result.
sub internal_new {
    return shift->new(@_);
}

# new(proto, parser) : instance
#
# Reads the 'html' attribute of $parser, replaces every non-breaking-space
# entity with a blank and every opening/closing <br> or <p> tag (any case,
# optional trailing " /") with a newline.  The cleaned text is stored under
# the 'html' key of the new instance, and the value returned is whatever
# set_read_only yields for it.
sub new {
    my($proto, $parser) = @_;
    my($html) = $parser->get('html');
    # The ampersand of a named entity is written as [&] so that tools which
    # decode HTML entities cannot silently rewrite this pattern.
    $html =~ s/[&]nbsp;/ /g;
    $html =~ s/<\/?(?:br|p) ?\/?>/\n/ig;
    return $proto->SUPER::new({
        html => $html,
    })->set_read_only;
}

# unescape_text(self, text) : string
#
# Returns '' when $text is undefined.  Otherwise decodes, in this order:
#   1. "&#39;"  to an apostrophe,
#   2. the named double-quote entity to '"',
#   3. every remaining decimal numeric character reference to one blank,
# and finally passes the result through Bivio::HTML->unescape.
sub unescape_text {
    my($self, $text) = @_;
    return '' unless defined($text);
    $text =~ s/\&\#39\;/'/g;
    # [&] keeps the entity name from being decoded inside this source file;
    # the pattern must match the entity, not a bare '";' sequence.
    $text =~ s/[&]quot;/"/g;
    # Must run after the "&#39;" rule above, which would otherwise be
    # swallowed by this catch-all.
    $text =~ s/\&\#\d+\;/ /g;
    $text = Bivio::HTML->unescape($text);
    return $text;
}

# text(self, text) : string
#
# Returns '' when $text is undefined.  Otherwise unescapes $text with
# unescape_text, squeezes each run of whitespace into a single blank and
# strips leading and trailing whitespace.
sub text {
    my($self, $text) = @_;
    return '' unless defined($text);
    $text = $self->unescape_text($text);
    $text =~ s/\s+/ /g;
    $text =~ s/^\s+|\s+$//g;
    return $text;
}

1;