Bivio::Search::Parser
# Copyright (c) 2008-2011 bivio Software, Inc. All Rights Reserved.
# $Id$
package Bivio::Search::Parser;
use strict;
use Bivio::Base 'Collection.Attributes';

# Bivio::Search::Parser
#
# Attribute collection describing one searchable document.  The public
# constructors (new_text, new_excerpt) wrap a model in a Search.Parseable,
# dispatch to the SearchParser class named by the parseable's "class"
# attribute, and then fill in defaults for whatever that class left unset.
# xapian_terms_and_postings adds the "terms" and "postings" attributes
# computed from the parsed result.

# Class handles resolved once at load time.
# NOTE(review): $_M is not referenced in the visible code; it is kept because
# b_use may load the class as a side effect -- confirm before removing.
my($_DT) = b_use('Type.DateTime');
my($_M) = b_use('Biz.Model');
my($_P) = b_use('Search.Parseable');
my($_S) = b_use('Type.String');
my($_D) = b_use('Bivio.Die');

# handle_new_excerpt(proto, parseable) : self
#
# Puts "excerpt" (from $parseable->get_excerpt) on the instance.  When called
# on a class, a new instance is created first.  If the instance has no true
# "text" attribute yet, handle_new_text is called before the excerpt is set.
sub handle_new_excerpt {
    my($proto, $parseable) = @_;
    my $self = ref($proto) ? $proto : $proto->new;
    if (!$self->unsafe_get('text')) {
        $self->handle_new_text($parseable);
    }
    return $self->put(excerpt => $parseable->get_excerpt);
}

# handle_new_text(proto, parseable) : self
#
# Puts "text" (from $parseable->get_content) on the instance, creating the
# instance first when called on a class.
sub handle_new_text {
    my($proto, $parseable) = @_;
    my $self = ref($proto) ? $proto : $proto->new;
    return $self->put(text => $parseable->get_content);
}

# new_text(proto, model) : self
#
# Must call _do directly: _do builds the handler name ("handle_new_text")
# from $proto->my_caller.
sub new_text {
    return _do(@_);
}

# new_excerpt(proto, model) : self
#
# Must call _do directly: _do builds the handler name ("handle_new_excerpt")
# from $proto->my_caller.
sub new_excerpt {
    return _do(@_);
}

# xapian_posting_synonyms() : array_ref
#
# This implementation has no synonyms; always returns an empty array_ref.
sub xapian_posting_synonyms {
    return [];
}

# xapian_terms_and_postings(proto, model) : self
#
# Runs new_excerpt on $model and, if that yields a true value, puts "terms"
# and "postings" on the result.  Postings are drawn from the path, title and
# text attributes.  Returns nothing when new_excerpt returns false.
sub xapian_terms_and_postings {
    my($proto, $model) = @_;
    my $self = $proto->new_excerpt($model);
    return
        unless $self;
    # Terms are computed before postings, as in the put() argument order.
    my $terms = _terms($self);
    my $postings = _postings(
        # path and title are passed by reference; text is passed as returned
        # by get (_postings dereferences every argument).
        \($self->get('path')),
        \($self->get('title')),
        $self->get('text'),
    );
    return $self->put(
        terms => $terms,
        postings => $postings,
    );
}

# _do(proto, model) : self
#
# Shared implementation of new_text and new_excerpt.
sub _do {
    my($proto, $model) = @_;
    # Accept either a Search.Parseable or something to wrap in one.
    my($parseable) = $_P->is_blesser_of($model) ? $model : $_P->new($model);
    my $real_model = $parseable->get('model');
    # NOTE(review): my_caller presumably yields the name of the sub that
    # called _do (new_text or new_excerpt) -- confirm.  It has to be
    # evaluated here, not inside a helper or closure.
    my($method) = 'handle_' . $proto->my_caller;
    my($die);
    my $parse = sub {
        return b_use(SearchParser => $parseable->get('class'))
            ->$method($parseable);
    };
    my($self) = $_D->catch($parse, \$die);
    if ($die) {
        b_warn('Could not parse file:', $die->get('attrs'));
    }
    # No usable result from the handler: continue with a bare instance.
    $self = $proto->new()
        unless $self;
    # Copy the parseable's attributes without overriding what the handler set.
    # The callback's first argument is discarded; the rest is forwarded.
    $parseable->map_each(
        sub {
            return $self->put_unless_exists(@_[1 .. $#_]);
        },
    );
    my $empty_text = '';
    # Defaults for everything still missing.
    $self->put_unless_exists(
        'RealmOwner.realm_id' => $real_model->get_auth_id,
        req => $parseable->req,
        author => '',
        author_email => '',
        author_user_id => $real_model->get_auth_user_id,
        excerpt => '',
        # NOTE(review): presumably put_unless_exists calls this only when
        # the key is absent -- confirm.
        modified_date_time => sub {
            return $real_model->unsafe_get('modified_date_time')
                || $_DT->now;
        },
        path => '',
        primary_id => $real_model->get_primary_id,
        simple_class => $real_model->simple_package_name,
        type => 'unparsed',
        title => '',
        text => \$empty_text,
    );
    # Pass every non-reference value to canonicalize_charset by reference.
    # $value aliases the stored value, so the reference points at the
    # attribute itself.
    for my $value (values(%{$self->internal_get})) {
        next
            if ref($value);
        $_S->canonicalize_charset(\$value);
    }
    return $self;
}

# _field_term(attrs, field, prefix) : string
#
# Builds "X<PREFIX>:<value>", where the value is the lower-cased $field
# attribute ('' when absent) and the prefix is upper-cased.  When $prefix is
# false, it is $field with every non-letter removed.
sub _field_term {
    my($attrs, $field, $prefix) = @_;
    unless ($prefix) {
        ($prefix = $field) =~ s/[^a-z]//ig;
    }
    my $value = lc($attrs->get_or_default($field, ''));
    return 'X' . uc($prefix) . ':' . $value;
}

# _omega_terms(self) : list
#
# Returns the S (title), T (content_type) and P (path) terms followed by the
# D, M and Y terms, which are the first 8, 6 and 4 characters of the
# to_local_file_name form of modified_date_time.
sub _omega_terms {
    my($self) = @_;
    my($stamp) = $_DT->to_local_file_name($self->get('modified_date_time'));
    return (
        # Q set by caller, since used in general to delete/add docs
        'S' . lc($self->get('title')),
        'T' . lc($self->get('content_type')),
        'P' . lc($self->get_or_default('path', '')),
        map(
            $_->[0] . substr($stamp, 0, $_->[1]),
            [D => 8], [M => 6], [Y => 4],
        ),
    );
}

# _postings(text_ref, ...) : array_ref
#
# Tokenizes each referenced string into lower-cased postings.  Each string is
# split on whitespace; a word that is only a dotted upper-case acronym
# (2 to 10 "X." groups, optionally surrounded by non-word characters or
# underscores) is kept whole, and any other word is split on runs of
# non-word characters and underscores.  Empty pieces are dropped.
sub _postings {
    # The whole tokenizer runs under the bytes pragma.
    use bytes;
    my(@postings);
    for my $text_ref (@_) {
        my $canonical = $_S->canonicalize_charset($text_ref);
        for my $word (split(' ', $$canonical)) {
            my(@pieces) = $word =~ /^[\W_]*((?:[A-Z]\.){2,10})[\W_]*$/
                ? $1
                : split(/[\W_]+/, $word);
            push(@postings, map(lc($_), grep(length($_), @pieces)));
        }
    }
    return \@postings;
}

# _terms(self) : array_ref
#
# Returns the field terms for realm_id, user_id, is_public and simple_class,
# followed by the terms from _omega_terms.
sub _terms {
    my($self) = @_;
    return [
        map(
            _field_term($self, $_),
            qw(realm_id user_id is_public simple_class),
        ),
        _omega_terms($self),
    ];
}

1;