Bivio::Util::CSV
# Copyright (c) 2001-2009 bivio Software, Inc. All Rights reserved.
# $Id$
package Bivio::Util::CSV;
use strict;
use Bivio::Base 'Bivio::ShellUtil';
use POSIX ();
my($_QUOTE) = '"';
my($_END_OF_VALUE) = qr/[\,\n]/;
my($_END_OF_LINE) = qr/\n/;
my($_TA) = b_use('Type.TextArea');
sub USAGE {
return <<'EOF';
usage: b-csv [options] command [args...]
commands:
colrm start [end] -- removes columns like colrm command
from_one_col value -- quotes a value if necessary
from_one_row array_ref -- converts an array to a quoted row
from_rows array_ref -- converts an array of arrays
parse [text.csv [want_line_numbers]] -- returns array of arrays
parse_records [text.csv [want_line_numbers]] -- returns array of hashes
sort_csv -- returns sorted csv from input csv
to_csv_text array -- returns text from array
EOF
}
sub colrm {
my($self, $start, $end) = @_;
# Reads I<input> and deletes columns starting at I<start> and ending at I<end>
# (or end of file). Currently sucks entire file into memory, which can be slow.
$self->usage_error($start, ": bad start")
unless $start =~ /^\d+$/;
$self->usage_error($end, ": bad end")
unless !defined($end) || $end =~ /^\d+$/;
my($res);
foreach my $line (split(/\n/, ${$self->read_input})) {
$self->usage_error("quoted text not supported") if $line =~ /"/;
my(@l) = split(/,/, $line);
defined($end) ? splice(@l, $start, $end) : splice(@l, $start);
$res .= join(',', @l)."\n";
}
return \$res;
}
sub from_one_col {
my(undef, $col) = @_;
return '' unless defined($col);
return $col
unless $col =~ /(?:^\s|\s$|$_QUOTE|$_END_OF_VALUE|\r)/;
$col =~ s/"/""/g;
return qq{"$col"};
}
sub from_one_row {
my($proto, $row) = @_;
return \(join(',', map($proto->from_one_col($_), @$row)) . "\n");
}
sub from_rows {
my($proto, $rows) = @_;
return \(join('', map(${$proto->from_one_row($_)}, @$rows)));
}
sub parse {
my($self, $csv_text, $want_line_numbers) = @_;
# Parses I<csv_text> into an array of array rows. if I<csv_text> not supplied,
# read_input is called. I<csv_text> may also be a string (need not be a ref).
#
# Dies on failure with an appropriate message.
#
# If I<want_line_numbers> is specified, then the first item of each row
# will contain the line number from the input text.
my($state) = {
buffer => $_TA->canonicalize_newlines(
!defined($csv_text) ? $self->read_input
: ref($csv_text) ? $csv_text
: _assert_csv_text($self, $csv_text),
),
want_line_numbers => $want_line_numbers,
char_count => 0,
line_number => 1,
current_value => '',
rows => [],
current_row => [$want_line_numbers ? 1 : ()],
};
while (defined(my $char = _next_char($state))) {
if ($char eq $_QUOTE) {
_die($state, 'quote character within unquoted value')
if length($state->{current_value});
while (defined($char = _next_char($state))) {
if ($char eq $_QUOTE) {
if (_peek_char($state) eq $_QUOTE) {
_append_char($state, $char);
_next_char($state);
}
else {
_die($state,
'unexpected character after closing quote')
unless _peek_char($state) =~ $_END_OF_VALUE;
last;
}
}
elsif ($char =~ $_END_OF_LINE) {
_next_line($state);
_append_char($state, "\n");
}
else {
_append_char($state, $char);
}
}
_die($state, 'unterminated quoted value')
unless defined($char);
}
elsif ($char =~ $_END_OF_VALUE) {
_end_value($state);
if ($char =~ $_END_OF_LINE) {
_next_line($state);
_end_row($state);
}
}
else {
_append_char($state, $char);
}
}
# add last row if input is missing end-of-line
if (length($state->{current_value})
|| scalar(@{$state->{current_row}}) > ($want_line_numbers ? 1 : 0)) {
_end_value($state);
_end_row($state);
}
# remove leading and trailing empty rows
while (scalar(@{$state->{rows}})) {
last unless _is_row_empty($state, -1);
pop(@{$state->{rows}});
}
while (scalar(@{$state->{rows}})) {
last unless _is_row_empty($state, 0);
shift(@{$state->{rows}});
}
return $state->{rows};
}
sub parse_records {
my($self, undef, $want_line_numbers, $heading_ref) = @_;
# Parses the CSV data, treating the first row as headings and returns
# an array of hash_ref records.
my($rows) = shift->parse(@_);
return $rows unless @$rows;
my($heading) = shift(@$rows);
@$heading_ref = @$heading
if $heading_ref;
$heading->[0] = '_line'
if $want_line_numbers;
return [
map({
my($row) = $_;
+{
map(($_, shift(@$row)), @$heading),
};
} @$rows),
];
}
sub sort_csv {
my($self, $csv_text) = @_;
my($headings) = [];
my($rows) = $self->parse_records($csv_text, 0, $headings);
my($h0) = $headings->[0];
return join('',
${$self->to_csv_text($headings)},
map({
my($row) = $_;
${$self->to_csv_text([map($row->{$_}, @$headings)])};
} sort({POSIX::strcoll($a->{$h0}, $b->{$h0})} @$rows)),
);
}
sub to_csv_text {
my($proto, $list) = @_;
my($method) = @$list && ref($list->[0]) ? 'from_rows' : 'from_one_row';
return $proto->$method($list);
}
sub _append_char {
my($state, $char) = @_;
# Appends a character to the current value.
$state->{current_value} .= $char;
return;
}
sub _assert_csv_text {
my($self, $csv_text) = @_;
$self->usage_error($csv_text, ': must be CSV, not file name')
unless $csv_text =~ m{\n} || $csv_text !~ m{/};
return \$csv_text;
}
sub _die {
my($state, @mesesage) = @_;
# Dies with the specified message. Includes the line number.
Bivio::Die->die('line: ', $state->{line_number}, ' ', @mesesage);
}
sub _end_row {
my($state) = @_;
# Ends the current parsed row.
push(@{$state->{rows}}, $state->{current_row});
$state->{current_row} = [
$state->{want_line_numbers} ? $state->{line_number} : ()];
return;
}
sub _end_value {
my($state) = @_;
# Ends the current parsed value.
push(@{$state->{current_row}}, $state->{current_value});
$state->{current_value} = '';
return;
}
sub _is_row_empty {
my($state, $index) = @_;
# Returns true if the row is empty, or contains a single entry composed
# of space.
return scalar(@{$state->{rows}->[$index]})
> ($state->{want_line_numbers} ? 2 : 1)
|| $state->{rows}->[$index]->[$state->{want_line_numbers} ? 1 : 0]
=~ /\S/
? 0 : 1;
}
sub _next_char {
my($state) = @_;
# Returns the next character, or undef if at the end of input.
return $state->{char_count} > length(${$state->{buffer}})
? undef
: substr(${$state->{buffer}}, $state->{char_count}++, 1);
}
sub _next_line {
my($state) = @_;
$state->{line_number}++;
return;
}
sub _peek_char {
my($state) = @_;
# Return the next character without advancing.
return substr(${$state->{buffer}}, $state->{char_count}, 1);
}
1;