# Copyright (c) 2002-2004 bivio Software, Inc.  All Rights Reserved.
#
# Visit http://www.bivio.biz for more info.
#
# This library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation; either version 2.1 of the
# License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library;  If not, you may get a copy from:
# http://www.opensource.org/licenses/lgpl-license.html
#
# $Id: HTMLParser.pm,v 2.5 2010/04/09 15:35:58 nagler Exp $
package Bivio::Test::HTMLParser;
use strict;
use Bivio::Base 'Collection.Attributes';

# Bivio::Test::HTMLParser directs parsing of html by calling the classes
# in the TestHTMLParser class map which have registered themselves via
# register().
#
# Attributes of an instance created by new() with a string_ref:
#
# html : string
#
#   The HTML which was passed to new.
#
# <simple_package_name> : parser instance
#
#   For each registered parser class, the object returned by that class's
#   internal_new is put on self under the class's simple_package_name.
#   See the parser classes for their attributes.

our($VERSION) = sprintf('%d.%02d', q$Revision: 2.5 $ =~ /\d+/g);

# Parser classes added by register(), in registration order.
my(@_CLASSES);
b_use('IO.ClassLoader')->map_require_all('TestHTMLParser');
my($_HP) = b_use('Ext.HTMLParser');

sub html_parser_comment {
    # No-op default for the parser callback of the same name.
    # NOTE(review): presumably overridden by parser subclasses that care
    # about comments -- confirm against Ext.HTMLParser.
    return;
}

sub html_parser_end {
    # No-op default for the parser callback of the same name.
    return;
}

sub html_parser_start {
    # No-op default for the parser callback of the same name.
    return;
}

sub html_parser_text {
    # No-op default for the parser callback of the same name.
    return;
}

sub internal_new {
    # (proto, Test.HTMLParser) : Test.HTMLParser
    # Creates an instance of the parser subclass and runs Ext.HTMLParser
    # over the cleaned html, handing it the new instance.
    #
    # While parsing, two attributes are set on the new instance:
    # "cleaner" is the value of $parser->get('Cleaner'), and "elements" is
    # a hash whose contents are put as attributes of the instance once
    # parsing is complete.  The instance is returned read-only.
    my($proto, $parser) = @_;
    my($self) = $proto->new;
    $self->internal_put({
        cleaner => $parser->get('Cleaner'),
        elements => {},
    });
    my($p) = $_HP->new($self);
    # Contents of script and style elements are not parsed.
    $p->ignore_elements(qw(script style));
    $p->parse($self->get('cleaner')->get('html'));
    # Replace the working attributes with whatever was collected in
    # "elements".
    $self->internal_put($self->get('elements'));
    return $self->set_read_only;
}

sub new {
    # (proto, string_ref) : Test.HTMLParser
    # (proto, hash_ref) : Test.HTMLParser
    # Parses the html in string_ref using the registered parser classes.
    # The result has an "html" attribute plus one attribute per registered
    # class (keyed by simple_package_name) and is read-only.
    #
    # If html is undef or a hash_ref is passed, does no parsing: the
    # arguments are passed through to the superclass new.  Subclasses
    # always pass through.
    my($proto) = shift;
    return $proto->SUPER::new(@_)
        unless (ref($proto) || $proto) eq __PACKAGE__;
    # Documented pass-through for the base class itself.  Without this
    # guard, $$html below dies under strict refs for undef and hash_ref.
    return $proto->SUPER::new(@_)
        if !defined($_[0]) || ref($_[0]) eq 'HASH';
    my($html) = shift;
    my($self) = $proto->SUPER::new({html => $$html});
    foreach my $c (@_CLASSES) {
        $self->put($c->simple_package_name => $c->internal_new($self));
    }
    return $self->set_read_only;
}

sub register {
    # (proto, array_ref) : undef
    # Adds proto's class to the list of parser classes used by new, but
    # first loads each of prerequisite_classes from the TestHTMLParser
    # class map.  prerequisite_classes may be omitted.
    my($proto, $prerequisite_classes) = @_;
    foreach my $p (@{$prerequisite_classes || []}) {
        b_use('TestHTMLParser', $p);
    }
    push(@_CLASSES, ref($proto) || $proto);
    return;
}

1;