# Copyright (c) 2008 bivio Software, Inc.  All Rights Reserved.
#
# Visit http://www.bivio.biz for more info.
#
# This library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation; either version 2.1 of the
# License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library;  If not, you may get a copy from:
# http://www.opensource.org/licenses/lgpl-license.html
#
# $Id: TextHTML.pm,v 1.2 2008/06/17 02:06:43 nagler Exp $
package Bivio::Search::Parser::RealmFile::TextHTML;
use strict;
use Bivio::Base 'SearchParser.RealmFile';

our($VERSION) = sprintf('%d.%02d', q$Revision: 1.2 $ =~ /\d+/g);

# Returns the content type this parser declares: 'text/html'.
sub CONTENT_TYPE_LIST {
    return 'text/html';
}

# Builds a parser instance from the HTML content of $parseable.
#
#   $proto     - class (or instance) used to load HTML.Scraper and to
#                construct the result via new()
#   $parseable - object whose get_content returns a reference to the HTML
#                string; that string is modified in place
#
# Returns $proto->new({...}) with type 'text/html', the text produced by
# HTML.Scraper, and a title entry only when a non-empty <title> was found.
sub handle_realm_file_new_text {
    my($proto, $parseable) = @_;
    my($t) = $parseable->get_content;
    # Remove the <title> element from the content and keep its text.  $1 is
    # read only when the substitution matched: after a failed match $1 still
    # holds the value of an earlier successful match, which would otherwise
    # be reported as this document's title.
    my($title) = $$t =~ s{<title>([^<]+)</title>}{}is ? $1 : undef;
    # Mark paragraph boundaries (a <p> tag, or two <br> tags separated only
    # by whitespace/&nbsp;) with a sentinel word so they survive the
    # conversion to text and the whitespace collapsing below.
    # NOTE(review): the tag literals in this pattern and the title pattern
    # were reconstructed -- confirm against the repository copy.
    $$t =~ s/<p\b[^>]*>|<br[^>]*>\s*(?:&nbsp;?)*<br[^>]*>/ PARAGRAPH_SPLIT_HERE /isg;
    $title =~ s/^\s+|\s+$//gs
        if defined($title);
    # to_text is given the content reference; its result is dereferenced
    # below, so it presumably returns a string reference as well.
    $t = $proto->use('HTML.Scraper')->to_text($t);
    # Collapse every whitespace run to one space, then turn each sentinel
    # (with any surrounding spaces) into a blank line.
    $$t =~ s/\s+/ /sg;
    $$t =~ s/ *\bPARAGRAPH_SPLIT_HERE\b */\n\n/sg;
    return $proto->new({
        type => 'text/html',
        defined($title) && length($title) ? (title => $title) : (),
        text => $t,
    });
}

1;