Bivio::Test::HTMLParser::Tables
# Copyright (c) 2002-2007 bivio Software, Inc. All Rights Reserved. # $Id$ package Bivio::Test::HTMLParser::Tables; use strict; use Bivio::Base 'Bivio::Test::HTMLParser'; use Bivio::IO::Trace; use Bivio::Test::HTMLParser::Tables::Cell; my($_IDI) = __PACKAGE__->instance_data_index; __PACKAGE__->register(['Cleaner']); sub do_rows { my($self, $table_name, $do_rows_callback) = @_; # Iterates over the rows over I<table_name>, calling # L<do_rows_callback|"do_rows_callback"> for each row. # # The special field C<_row_index> is set to the value of the index of that row. my($index) = -1; my($t) = _assert_table($self, $table_name); foreach my $row (@{$t->{rows}}) { my($i) = -1; $index++; last unless $do_rows_callback->( { _row_index => $index, map({ my($value) = $row->[++$i]; defined($value) ? ($_->get('text') => $value) : (); } @{$t->{headings}}), }, $index, ); } return; } sub find_row { my($self) = shift; # Return the hash_ref of the the row where the value in I<column_name> # matches I<column_value>. Dies if row not found. # # If I<table_name> not supplied, calls L<get_by_headings|"get_by_headings"> with # I<column_name> for table. my($table_name) = shift if @_ > 2; my($column_name, $column_value) = @_; $table_name = $self->get_by_headings( defined($table_name) ? $table_name : $column_name, )->{headings}->[0]->get('text') if !defined($table_name) || ref($table_name); my($found_row); $column_name = _assert_column($self, $table_name, $column_name); my($misses) = []; $self->do_rows($table_name, sub { my($row) = @_; # not all rows have all columns defined return 1 unless exists($row->{$column_name}); my($t) = $row->{$column_name}->get('text'); push(@$misses, $t); $found_row = $row if _eq($column_value, $t); return $found_row ? 0 : 1; }); Bivio::Die->die( $column_value, ': not found in column "', $column_name, '" values: ', $misses, ) unless $found_row; return wantarray ? ($found_row, $column_name) : $found_row; } sub get_by_headings { my($self, @name) = @_; # Returns the table data by finding by I<name>(s) in heading fields. my($found); my($tables) = $self->get_shallow_copy; TABLE: while (my($table, $values) = each(%$tables)) { foreach my $n (@name) { next TABLE unless grep(_eq($n, $_->get('text')), @{$values->{headings}}); } Bivio::Die->die(\@name, ': too many tables matched headings') if $found; $found = $values; } return $found || Bivio::Die->die(\@name, ': no table matches named headings'); } sub html_parser_end { my($self, $tag) = (shift, @_); # Dispatch to the _end_XXX routines. my($fields) = $self->[$_IDI]; $fields->{links}->html_parser_end(@_) if $fields->{links}; _call_op('end', $tag, $self); return; } sub html_parser_start { my($self, $tag, $attr) = (shift, @_); # Calls _fixup_attr then dispatches to the _start_XXX routines. my($fields) = $self->[$_IDI]; $fields->{links}->html_parser_start(@_) if $fields->{links}; return if _call_op('start', $tag, $self, $attr); return _start_input($self, $attr) if $attr->{type}; return; } sub html_parser_text { my($self, $text) = (shift, @_); $text = $self->get('cleaner')->unescape_text($text); # Parses the tables. Called internally. my($fields) = $self->[$_IDI]; $fields->{links}->html_parser_text(@_) if $fields->{links}; return unless $fields->{in_data_table}; $fields->{text} .= $text; return; } sub new { my($proto, $parser) = @_; return undef if $parser->is_not_bivio_html; # Parses cleaned html for forms. my($self) = $proto->SUPER::new; $self->[$_IDI] = {}; return $self; } sub _assert_column { my($self, $table_name, $column_name) = @_; my($table) = _assert_table($self, $table_name); my(@match) = grep(_eq($column_name, $_->get('text')), @{$table->{headings}}); Bivio::Die->die($column_name, ': column name not found') unless @match; Bivio::Die->die(\@match, ': too many columns found for ', $column_name) #TODO: There's an odd test case which requires this $column_name condition if $column_name && @match > 1; return $match[0]->get('text'); } sub _assert_table { my($self, $table_name) = @_; Bivio::Die->die($table_name, ': table not found; tables=', $self->get_keys) unless $self->unsafe_get($table_name); return $self->get($table_name); } sub _call_op { my($prefix, $tag, @arg) = @_; # Calls _$prefix_$tag if it is defined. my($op) = \&{"_$prefix" . "_$tag"}; return 0 unless defined(&$op); $op->(@arg); return 1; } sub _delete_empty_rows { my($rows) = @_; # Deletes totally empty rows from the table. They are probably separator # rows. for (my($i) = 0; $i < @$rows; $i++) { next if grep(defined($_) && length($_->get('text')), @{$rows->[$i]}); _trace($rows->[$i]) if $_TRACE; splice(@$rows, $i--, 1) } return; } sub _end_table { my($self) = @_; # The only tables we track are "data" tables. my($fields) = $self->[$_IDI]; return unless $fields->{in_data_table} && !--$fields->{in_data_table}; # Delete totally empty rows (probably separators) _delete_empty_rows($fields->{table}->{rows}); my($elements) = $self->get('elements'); my($name) = $fields->{table}->{label} ||= '_anon#' . keys(%{$self->get('elements')}); if ($elements->{$name}) { my($count) = 1; while ($elements->{$name . '#' . $count}) { $count++; } $name .= '#' . $count; } $elements->{$name} = $fields->{table}; _trace($fields->{table}) if $_TRACE; delete($fields->{table}); return; } sub _end_td { my($self) = @_; # Adds the text from column to current row my($fields) = $self->[$_IDI]; return unless $fields->{table} && @{$fields->{table}->{rows}}; _save_cell($self, $fields, $fields->{table}->{rows}->[$#{$fields->{table}->{rows}}]); return; } sub _end_th { my($self) = @_; # Ends the "th". Saves the cell and id for table (if not already there). my($fields) = $self->[$_IDI]; return unless $fields->{table}; my($t) = _save_cell($self, $fields, $fields->{table}->{headings}); $fields->{table}->{label} ||= $t; return; } sub _eq { my($expect, $actual) = @_; return ref($expect) ? $actual =~ $expect : $actual eq $expect; } sub _found_table { my($fields, $id) = @_; # Either at <table id=xxx> or at every <th>. Returns true if # initializes table. unless ($fields->{in_data_table}) { $fields->{in_data_table}++; $fields->{table} = { headings => [], rows => [], label => $id, }; } # elsif ($fields->{in_data_table} > 1) { # die('nested data tables not supported'); # } return; } sub _in_data { my($fields) = @_; # Returns true if at data table level. return ($fields->{in_data_table} || 0) == 1 ? 1 : 0; } sub _links { my($self) = @_; my($fields) = $self->[$_IDI]; return $fields->{links} ||= Bivio::Test::HTMLParser::Links->new->internal_put({ cleaner => $self->get('cleaner'), elements => {}, }); } sub _save_cell { my($self, $fields, $row) = @_; # Checks colspan to see if needs filling. Returns the found text, # if any. return unless $fields->{in_data_table} == 1; my($t) = $self->get('cleaner')->text(_text($fields)); push(@$row, Bivio::Test::HTMLParser::Tables::Cell->new({ text => $t, Links => _links($self)->internal_put( _links($self)->get('elements'))->set_read_only, })); $fields->{links} = undef; _trace($t) if $_TRACE; push(@$row, undef) while --$fields->{colspan} > 0; return $t; } sub _start_input { my($self, $attr) = @_; # Saves "value" attribute. my($fields) = $self->[$_IDI]; $fields->{text} .= $attr->{value} || ''; return; } sub _start_table { my($self, $attr) = @_; # Increments in_data_table my($fields) = $self->[$_IDI]; $fields->{in_data_table}++ if $fields->{in_data_table}; _found_table($fields, $attr->{id}) if $attr->{id}; return; } sub _start_td { my($self, $attr) = @_; # Starts a TD. my($fields) = $self->[$_IDI]; # Don't separate cells in nested table #TODO: Format like a table, e.g. </td> -> ' ', </tr> -> \n return unless _in_data($fields); $fields->{text} = ''; $fields->{colspan} = $attr->{colspan} || 1; _links($self); return; } sub _start_th { my($self, $attrs) = @_; # Starts a TH and initializes {table} if necessary. my($fields) = $self->[$_IDI]; _found_table($fields); return _start_td(@_); } sub _start_tr { my($self, $attr) = @_; # Only adds rows if rows has been initialized. my($fields) = $self->[$_IDI]; return unless _in_data($fields); push(@{$fields->{table}->{rows}}, []) if $fields->{table}->{rows}; return; } sub _text { my($fields) = @_; # Returns the cleaned text field or an empty string if not defined. my($res) = defined($fields->{text}) ? $fields->{text} : ''; $fields->{text} = undef; return $res; } 1;