# Copyright (c) 2008 bivio Software, Inc.  All Rights Reserved.
#
# Visit http://www.bivio.biz for more info.
#
# This library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation; either version 2.1 of the
# License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library;  If not, you may get a copy from:
# http://www.opensource.org/licenses/lgpl-license.html
#
# $Id: TextHTML.pm,v 1.2 2008/06/17 02:06:43 nagler Exp $
package Bivio::Search::Parser::RealmFile::TextHTML;
use strict;
use Bivio::Base 'SearchParser.RealmFile';

our($VERSION) = sprintf('%d.%02d', q$Revision: 1.2 $ =~ /\d+/g);

# Returns the content type(s) this parser is registered for.
sub CONTENT_TYPE_LIST {
    return 'text/html';
}

# Builds a parser instance from an HTML realm file.
#
#   $proto     - class or instance; used to load 'HTML.Scraper' and to call new()
#   $parseable - object whose get_content() returns a reference to the raw HTML
#
# Returns $proto->new({type, [title,] text}) where text is a reference to the
# plain-text rendering with paragraphs separated by blank lines.  The title
# attribute is only supplied when a non-empty <title> was found.
#
# The scalar referenced by get_content() is modified in place.
sub handle_realm_file_new_text {
    my($proto, $parseable) = @_;
    my($t) = $parseable->get_content;
    # NOTE(review): the tag patterns in the next two substitutions were lost
    # from the checked-in text (everything between '<' and '>' was stripped)
    # and have been reconstructed from what remained; confirm against the
    # version-control history.  The title is only assigned on a successful
    # match so a stale $1 from an earlier regex can never leak in.
    my($title);
    if ($$t =~ s{<title[^>]*>(.*?)</title>}{}is) {
        $title = $1;
    }
    # Mark paragraph boundaries (<p>, or two <br> separated only by
    # whitespace/&nbsp;) with a sentinel word that survives the conversion
    # to text and the whitespace collapsing below.
    $$t =~ s/<p[^>]*>|<br[^>]*>\s*(&nbsp;?)*<br[^>]*>/ PARAGRAPH_SPLIT_HERE /isg;
    $title =~ s/^\s+|\s+$//gs
        if defined($title);
    $t = $proto->use('HTML.Scraper')->to_text($t);
    # Collapse all runs of whitespace first, then turn each sentinel (and the
    # spaces around it) into a blank line.
    $$t =~ s/\s+/ /sg;
    $$t =~ s/ *\bPARAGRAPH_SPLIT_HERE\b */\n\n/sg;
    return $proto->new({
        type => 'text/html',
        defined($title) && length($title) ? (title => $title) : (),
        text => $t,
    });
}
1;