#!/usr/bin/perl
#
# sphinx-search-gm
#
# This Gearman worker is responsible for taking a search and issuing it to the
# Sphinx searchd.
#
# Authors:
#      Mark Smith <mark@dreamwidth.org>
#
# Copyright (c) 2009-2013 by Dreamwidth Studios, LLC.
#
# This program is free software; you may redistribute it and/or modify it under
# the same terms as Perl itself. For a copy of the license, please reference
# 'perldoc perlartistic' or 'perldoc perlgpl'.
#
|
|
use strict;
|
|
BEGIN {
|
|
require "$ENV{LJHOME}/cgi-bin/ljlib.pl";
|
|
}
|
|
|
|
use Gearman::Worker;
|
|
use LJ::Worker::Gearman;
|
|
use Sphinx::Search;
|
|
use Storable;
|
|
|
|
# Register sphinx_search() as the handler for 'sphinx_search' jobs, then call
# gearman_work() to start processing them (both helpers are presumably
# provided by LJ::Worker::Gearman).
gearman_decl( 'sphinx_search', \&sphinx_search );
gearman_work();
|
|
|
|
# Configure the Sphinx client for one request and execute the query.
#
# Arguments:
#   $sx   - Sphinx::Search client object; it is configured in place.
#   $args - hashref of options. Keys read here: query, offset, support,
#           sort_by, userid, include_comments, ignore_security, allowmask.
#
# Side effect: when the whole query is wrapped in quotes, the quotes are
# removed from $args->{query} itself, so the caller sees the unquoted text.
#
# Returns the value of $sx->Query for the chosen index list.
sub _run_search {
    my ( $sx, $args ) = @_;

    # support log searches use their own index; everything else searches the
    # main index together with its delta
    my $index = $args->{support} ? 'dwsupport' : 'dw1,dw1delta';

    $sx->SetServer( @LJ::SPHINX_SEARCHD );

    # both encoder hooks are identity functions: they return their argument
    my $passthrough = sub { return $_[0] };
    $sx->SetEncoders( $passthrough, $passthrough );

    # baseline settings: match all words, order by relevance, cap the query
    # time at 15_000 (milliseconds in the Sphinx API), and fetch 20 results
    # starting at the requested offset
    $sx->SetMatchMode( SPH_MATCH_ALL );
    $sx->SetSortMode( SPH_SORT_RELEVANCE );
    $sx->SetMaxQueryTime( 15_000 );
    $sx->SetLimits( $args->{offset} || 0, 20 );

    # a query wrapped entirely in quotes becomes a phrase search; the
    # substitution also strips the surrounding quotes from the query
    $sx->SetMatchMode( SPH_MATCH_PHRASE )
        if $args->{query} =~ s/^['"](.+)['"]$/$1/;

    # SUPPORT LOG SEARCH OPTIONS
    if ( $args->{support} ) {
        # No filters are applied here; security is handled at the viewing
        # layer. Results are ordered by touchtime, newest first.
        $sx->SetSortMode( SPH_SORT_ATTR_DESC, 'touchtime' );
        return $sx->Query( $args->{query}, $index );
    }

    # ENTRY/COMMENT SEARCH OPTIONS

    # honour the requested sort; any other value keeps the relevance sort
    my %sort_mode_for = (
        'new' => SPH_SORT_ATTR_DESC,
        'old' => SPH_SORT_ATTR_ASC,
    );
    if ( exists $sort_mode_for{ $args->{sort_by} } ) {
        $sx->SetSortMode( $sort_mode_for{ $args->{sort_by} }, 'date_posted' );
    }

    # restrict to one journal when a userid is given; otherwise only include
    # items flagged allow_global_search
    if ( $args->{userid} ) {
        $sx->SetFilter( 'journalid', [ $args->{userid} ] );
    } else {
        $sx->SetFilter( 'allow_global_search', [ 1 ] );
    }

    # leave out items marked deleted (user is not visible)
    $sx->SetFilter( 'is_deleted', [ 0 ] );

    # jtalkid 0 means "entry", so this range drops comments when they were not
    # asked for
    if ( $args->{include_comments} == 0 ) {
        $sx->SetFilterRange( 'jtalkid', 0, 0 );
    }

    # security filtering is a dangerous game. the caller either tells us to
    # ignore security or hands us a mask of bits, and we derive the Sphinx
    # filters from that.
    if ( !$args->{ignore_security} ) {
        # allow public posts (102) and anything the mask allows
        $sx->SetFilter( 'security_bits',
            [ 102, LJ::bit_breakdown( $args->{allowmask} ) ] );

        # private entries should only be viewable when security is ignored.
        # the trailing 1 makes this an exclusion filter; it works around data
        # where an entry is indexed as both private and having an allowmask.
        $sx->SetFilter( 'security_bits', [ 0 ], 1 );
    }

    return $sx->Query( $args->{query}, $index );
}
|
|
|
|
# Turn raw Sphinx matches from the support log index into displayable results.
#
# Arguments:
#   $sx       - Sphinx::Search client, used for BuildExcerpts
#   $query    - query text the excerpts are built for
#   $res      - result hashref from the search; {total} and {matches} are read
#   $remoteid - userid of the viewer; visibility checks run against this user
#
# Returns $res with {matches} narrowed to the rows the viewer may read, each
# annotated with url, type, spid, category, subject and excerpt. Returns $res
# untouched when {total} is not positive, and returns nothing when the database
# reader or the viewer cannot be loaded.
sub _build_output_support {
    my ( $sx, $query, $res, $remoteid ) = @_;
    return $res if $res->{total} <= 0;

    my $dbr = LJ::get_db_reader()
        or return;
    my $remote = LJ::load_userid( $remoteid )
        or return;

    # @out collects the very same hashrefs that live in $res->{matches}, minus
    # the ones we skip; it exists for convenience only
    my @out;

    my %spcache;
    foreach my $match ( @{ $res->{matches} } ) {
        # Yes, we have to use raw SQL here...
        my ( $spid, $type, $content ) = $dbr->selectrow_array(
            q{SELECT spid, type, message FROM supportlog WHERE splid = ?},
            undef, $match->{doc}
        );
        next if $dbr->err;

        # selectrow_array hands back an empty list when no row matches (for
        # example a log row that is gone but still indexed); without a spid
        # there is no request to load or to check permissions against
        next unless defined $spid;

        # Fetch the request (with caching, as often terms are repeated in requests)
        my $sp = ( $spcache{$spid} ||= LJ::Support::load_request( $spid ) )
            or next;

        # Now, security check this item: internal and screened log rows have
        # their own checks, everything else needs category read access
        my $visible = LJ::Support::can_read_cat( $sp->{_cat}, $remote );
        if ( $type eq 'internal' ) {
            $visible = LJ::Support::can_read_internal( $sp, $remote );
        } elsif ( $type eq 'screened' ) {
            $visible = LJ::Support::can_read_screened( $sp, $remote );
        }
        next unless $visible;

        $match->{url}      = "$LJ::SITEROOT/support/see_request?id=" . $spid;
        $match->{type}     = $type;
        $match->{spid}     = $spid;
        $match->{category} = $sp->{_cat}->{catname};
        $match->{subject}  = $sp->{subject};
        $match->{content}  = $content;
        push @out, $match;
    }

    # Build the excerpts for the bodies. With nothing visible there is nothing
    # to excerpt, so the call to searchd is skipped entirely.
    my $exc = [];
    if ( @out ) {
        $exc = $sx->BuildExcerpts( [ map { $_->{content} } @out ], 'dwsupport', $query, {} ) || [];
    }

    # if we have a matching number of excerpts to matches, then we can determine
    # which one goes with which log row.
    if ( scalar( @out ) == scalar( @$exc ) ) {
        foreach my $m ( @out ) {
            delete $m->{content};
            $m->{excerpt} = shift @$exc;
        }

    } else {
        # something terrible has happened..., user gets no excerpts :(
        foreach my $m ( @out ) {
            delete $m->{content};
            $m->{excerpt} = '(something terrible happened to the excerpts)';
        }
    }

    # only the matches that made it into @out carry an excerpt key, so this
    # drops everything that was skipped above
    $res->{matches} = [ grep { exists $_->{excerpt} } @{ $res->{matches} } ];
    return $res;
}
|
|
|
|
# Turn raw Sphinx matches from the entry/comment index into displayable results.
#
# Arguments:
#   $sx       - Sphinx::Search client, used for BuildExcerpts
#   $query    - query text the excerpts are built for
#   $res      - result hashref from the search; {total} and {matches} are read
#   $remoteid - userid of the viewer; visibility checks run against this user
#
# Each match with jtalkid == 0 is treated as an entry and each match with
# jtalkid > 0 as a comment. Visible items get subject, url, security and
# eventtime (plus tags for entries); items that cannot be loaded or are not
# visible get placeholder text instead. Every processed match ends up with an
# excerpt key.
#
# Returns $res; the match hashrefs inside it are modified in place.
sub _build_output {
    my ( $sx, $query, $res, $remoteid ) = @_;

    # nothing to annotate when the search found nothing
    return $res unless $res->{total} > 0;

    # the viewer is the same for every match, so load it once
    my $remote = LJ::load_userid( $remoteid );

    # @out collects the very same hashrefs that live in $res->{matches}; it
    # exists for convenience only
    my @out;

    foreach my $match ( @{ $res->{matches} } ) {
        if ( $match->{jtalkid} == 0 ) {
            my $entry = LJ::Entry->new( $match->{journalid}, jitemid => $match->{jitemid} );

            # check for validity and for security
            # we filtered by security earlier, but there's a chance it was changed
            # but not yet indexed
            if ( $entry && $entry->valid && $entry->visible_to( $remote ) ) {
                # use text only version of event for excerpt purposes.  best effort.
                $match->{entry} = $entry->event_text;
                $match->{entry} =~ s#<(?:br|p)\s*/?># #gi;
                $match->{entry} = LJ::strip_html( $match->{entry} );
                $match->{entry} ||= "(this entry only contains html content)";

                # we don't munge the subject... just clean it
                $match->{subject} = $entry->subject_text || '(no subject)';

                # also useful information that we want for later
                $match->{url}      = $entry->url;
                $match->{tags}     = $entry->tag_map;
                $match->{security} = $entry->security;
                $match->{security} = 'access'
                    if $match->{security} eq 'usemask' &&
                       $entry->allowmask == 1;
                $match->{eventtime} = $entry->eventtime_mysql;

            } else {
                # something happened, couldn't get the entry
                $match->{entry}   = '(sorry, this entry has been deleted or is otherwise unavailable)';
                $match->{subject} = 'Entry deleted or unavailable.';
            }
            push @out, $match;

        } elsif ( $match->{jtalkid} > 0 ) {
            my $cmt = LJ::Comment->new( $match->{journalid}, jtalkid => $match->{jtalkid} );

            # only ask for the parent entry once we know there is a comment
            # object; calling ->entry on undef would kill the whole job
            my $entry = $cmt ? $cmt->entry : undef;

            # check for validity and for security
            # we filtered by security earlier, but there's a chance it was changed
            # but not yet indexed
            if ( $entry && $entry->valid && $entry->visible_to( $remote ) &&
                 $cmt && $cmt->valid && $cmt->visible_to( $remote ) ) {
                # use text only version of the comment for excerpt purposes.  best effort.
                $match->{entry} = $cmt->body_text;
                $match->{entry} ||= "(this comment only contains html content)";

                # we don't munge the subject... just clean it
                $match->{subject} = $cmt->subject_text || '(no subject)';

                # also useful information that we want for later; security
                # comes from the parent entry
                $match->{url}      = $cmt->url;
                $match->{security} = $entry->security;
                $match->{security} = 'access'
                    if $match->{security} eq 'usemask' &&
                       $entry->allowmask == 1;
                $match->{eventtime} = $cmt->{datepost};

            } else {
                # something happened, couldn't get the comment
                $match->{entry}   = '(sorry, this comment has been deleted or is otherwise unavailable)';
                $match->{subject} = 'Comment deleted or unavailable.';
            }
            push @out, $match;
        }
    }

    # nothing was processed, so there is nothing to excerpt
    return $res unless @out;

    # FIXME: We are using English stemming in this index.  We could try to build separate
    # stemmed indices for other languages, if we want.
    my $exc  = $sx->BuildExcerpts( [ map { $_->{entry} } @out ],   'dw1delta', $query, {} ) || [];
    my $subj = $sx->BuildExcerpts( [ map { $_->{subject} } @out ], 'dw1delta', $query, {} ) || [];

    # if we have a matching number of excerpts to matches, then we can determine
    # which one goes with which post.
    if ( scalar( @out ) == scalar( @$exc ) ) {
        # subject excerpts come from a separate call that can fail on its own;
        # only swap them in when they line up one-to-one too, otherwise keep
        # the plain subjects set above
        my $have_subj = scalar( @$subj ) == scalar( @out );

        foreach my $m ( @out ) {
            delete $m->{entry};
            $m->{excerpt} = shift @$exc;
            $m->{subject} = shift @$subj if $have_subj;
        }

    } else {
        # something terrible has happened..., user gets no excerpts :(
        foreach my $m ( @out ) {
            delete $m->{entry};
            $m->{excerpt} = '(something terrible happened to the excerpts)';
        }
    }

    return $res;
}
|
|
|
|
# Gearman handler for 'sphinx_search' jobs.
#
# Arguments:
#   $job - job object; $job->arg must be a Storable-frozen hashref of search
#          options (query, support, remoteid, plus whatever _run_search reads)
#
# Returns the Storable::nfreeze'd result hashref, or undef when there is no
# query, the search returns nothing, or the output cannot be built.
sub sphinx_search {
    my $job = $_[0];

    # NOTE(review): Storable::thaw should only be fed trusted data; this
    # assumes job arguments are produced by our own code -- confirm.
    my $args = Storable::thaw( $job->arg ) || {};
    return undef unless $args->{query};

    my $sx             = Sphinx::Search->new();
    my $search_results = _run_search( $sx, $args );
    return undef unless $search_results;

    # $args->{query} is read again here on purpose: _run_search strips the
    # surrounding quotes from it for phrase searches
    my $res = $args->{support} ?
        _build_output_support( $sx, $args->{query}, $search_results, $args->{remoteid} ) :
        _build_output( $sx, $args->{query}, $search_results, $args->{remoteid} );

    # the support output builder returns nothing when it cannot get a database
    # reader or load the viewer; nfreeze croaks on anything that is not a
    # reference, so report failure the same way as the checks above
    return undef unless $res;

    return Storable::nfreeze( $res );
}
|
|
|