mourningdove/bin/worker/sphinx-search-gm

287 lines
11 KiB
Text
Raw Permalink Normal View History

2026-05-24 01:03:05 +00:00
#!/usr/bin/perl
#
# sphinx-search-gm
#
# This Gearman worker is responsible for taking a search and issuing it to the
# Sphinx searchd.
#
# Authors:
# Mark Smith <mark@dreamwidth.org>
#
# Copyright (c) 2009-2013 by Dreamwidth Studios, LLC.
#
# This program is free software; you may redistribute it and/or modify it under
# the same terms as Perl itself. For a copy of the license, please reference
# 'perldoc perlartistic' or 'perldoc perlgpl'.
#
use strict;
BEGIN {
require "$ENV{LJHOME}/cgi-bin/ljlib.pl";
}
use Gearman::Worker;
use LJ::Worker::Gearman;
use Sphinx::Search;
use Storable;
# Associate the Gearman function name 'sphinx_search' with the sphinx_search()
# sub defined in this file, then call gearman_work().
# NOTE(review): gearman_decl/gearman_work are presumably exported by
# LJ::Worker::Gearman, and gearman_work() presumably runs the job loop --
# confirm against that module.
gearman_decl( 'sphinx_search' => \&sphinx_search );
gearman_work();
# Apply the caller's search options to a Sphinx client and execute the query.
#
# Arguments:
#   $sx   - the Sphinx client object to configure and query.
#   $args - hashref of options; this sub reads query, offset, support,
#           sort_by, userid, include_comments, ignore_security and allowmask.
#
# Side effect: a query that is wrapped in quotes is unwrapped in place, so the
# caller's $args->{query} is modified.
#
# Returns whatever $sx->Query returns for the selected index list.
sub _run_search {
    my ( $sx, $args ) = @_;

    # where to connect; the encoder callbacks hand strings back untouched
    $sx->SetServer( @LJ::SPHINX_SEARCHD );
    $sx->SetEncoders( sub { shift }, sub { shift } );

    # baseline for every search: match-all mode, relevance ordering, a query
    # time limit of 15_000, and a page of 20 results from the given offset
    $sx->SetMatchMode( SPH_MATCH_ALL );
    $sx->SetSortMode( SPH_SORT_RELEVANCE );
    $sx->SetMaxQueryTime( 15_000 );
    $sx->SetLimits( $args->{offset} || 0, 20 );

    # a query quoted from end to end becomes a phrase search; the quotes
    # themselves are removed from the query text
    my $was_quoted = ( $args->{query} =~ s/^['"](.+)['"]$/$1/ );
    $sx->SetMatchMode( SPH_MATCH_PHRASE ) if $was_quoted;

    my $index;
    if ( $args->{support} ) {

        # SUPPORT LOG SEARCH: no filters are applied here, security is
        # handled at the viewing layer. Newest content is sorted first.
        $index = 'dwsupport';
        $sx->SetSortMode( SPH_SORT_ATTR_DESC, 'touchtime' );

    } else {

        # ENTRY/COMMENT SEARCH
        $index = 'dw1,dw1delta';

        # honour an explicit 'new' or 'old' ordering; any other value keeps
        # the relevance sort configured above
        my %order_for = ( new => SPH_SORT_ATTR_DESC, old => SPH_SORT_ATTR_ASC );
        if ( exists $order_for{ $args->{sort_by} } ) {
            $sx->SetSortMode( $order_for{ $args->{sort_by} }, 'date_posted' );
        }

        # restrict to one journal when a userid is given; otherwise only
        # items flagged allow_global_search are eligible
        if ( $args->{userid} ) {
            $sx->SetFilter( 'journalid', [ $args->{userid} ] );
        } else {
            $sx->SetFilter( 'allow_global_search', [ 1 ] );
        }

        # never return items marked deleted (user is not visible)
        $sx->SetFilter( 'is_deleted', [ 0 ] );

        # with comments switched off, only documents whose jtalkid is 0 remain
        if ( $args->{include_comments} == 0 ) {
            $sx->SetFilterRange( 'jtalkid', 0, 0 );
        }

        # security filtering: the caller either tells us to ignore security
        # or supplies an allowmask from which the permitted bits are derived
        if ( !$args->{ignore_security} ) {

            # 102 plus every bit of the allowmask
            # NOTE(review): 102 is presumably the value that marks public
            # posts in the index -- confirm against the indexer
            my @allowed = ( 102, LJ::bit_breakdown( $args->{allowmask} ) );
            $sx->SetFilter( 'security_bits', \@allowed );

            # second filter on the same attribute with a true third argument:
            # keeps private entries out even where the index marks an entry
            # as both private and carrying an allowmask
            $sx->SetFilter( 'security_bits', [ 0 ], 1 );
        }
    }

    return $sx->Query( $args->{query}, $index );
}
# Turn raw Sphinx matches from the support index into displayable results.
#
# Arguments:
#   $sx       - Sphinx client; used here only for BuildExcerpts.
#   $query    - query text handed to BuildExcerpts.
#   $res      - result hashref from the search. {total} and {matches} are
#               read; {matches} is replaced by the list of visible hits.
#   $remoteid - userid of the viewer, used for the visibility checks.
#
# Returns $res (untouched when {total} is not positive), or nothing when the
# database reader or the remote user cannot be obtained.
sub _build_output_support {
    my ( $sx, $query, $res, $remoteid ) = @_;
    return $res if $res->{total} <= 0;

    my $dbr = LJ::get_db_reader()
        or return;
    my $remote = LJ::load_userid( $remoteid )
        or return;

    # @out holds the very same hashrefs that live in $res->{matches}; it is
    # only a convenient list of the ones that survived the checks below
    my @out;
    my %spcache;
    foreach my $match ( @{ $res->{matches} } ) {

        # Yes, we have to use raw SQL here...
        my ( $spid, $type, $content ) = $dbr->selectrow_array(
            q{SELECT spid, type, message FROM supportlog WHERE splid = ?},
            undef, $match->{doc}
        );
        next if $dbr->err;

        # no row came back for this document: skip it instead of looking up
        # and caching a request under an undefined id
        next unless defined $spid;

        # Fetch the request (cached, as several hits often share a request)
        my $sp = ( $spcache{$spid} ||= LJ::Support::load_request( $spid ) )
            or next;

        # Security check: category access by default, with stricter checks
        # for internal and screened log rows
        my $visible = LJ::Support::can_read_cat( $sp->{_cat}, $remote );
        if ( $type eq 'internal' ) {
            $visible = LJ::Support::can_read_internal( $sp, $remote );
        } elsif ( $type eq 'screened' ) {
            $visible = LJ::Support::can_read_screened( $sp, $remote );
        }
        next unless $visible;

        $match->{url}      = "$LJ::SITEROOT/support/see_request?id=" . $spid;
        $match->{type}     = $type;
        $match->{spid}     = $spid;
        $match->{category} = $sp->{_cat}->{catname};
        $match->{subject}  = $sp->{subject};
        $match->{content}  = $content;
        push @out, $match;
    }

    # Build the excerpts for the bodies.
    my $exc = $sx->BuildExcerpts( [ map { $_->{content} } @out ], 'dwsupport', $query, {} ) || [];

    # Excerpts can only be paired with hits when there is exactly one per
    # hit; otherwise every hit gets a placeholder. Either way the raw
    # content is dropped from the result.
    my $paired = scalar( @out ) == scalar( @$exc );
    foreach my $m ( @out ) {
        delete $m->{content};
        $m->{excerpt} = $paired
            ? shift @$exc
            : '(something terrible happened to the excerpts)';
    }

    # only hits that were given an excerpt passed the checks above
    $res->{matches} = [ grep { exists $_->{excerpt} } @{ $res->{matches} } ];
    return $res;
}
# Fill in the display fields of $match for an entry hit, or placeholder text
# when the entry cannot be loaded, is not valid, or is not visible to $remote.
sub _describe_entry_match {
    my ( $match, $remote ) = @_;

    my $entry = LJ::Entry->new( $match->{journalid}, jitemid => $match->{jitemid} );

    # we filtered by security in the search, but it may have changed since
    # the item was indexed, so check validity and visibility again
    my $usable = $entry && $entry->valid && $entry->visible_to( $remote );
    if ( !$usable ) {
        $match->{entry}   = '(sorry, this entry has been deleted or is otherwise unavailable)';
        $match->{subject} = 'Entry deleted or unavailable.';
        return;
    }

    # use a text only version of the event for excerpt purposes. best effort.
    $match->{entry} = $entry->event_text;
    $match->{entry} =~ s#<(?:br|p)\s*/?># #gi;
    $match->{entry} = LJ::strip_html( $match->{entry} );
    $match->{entry} ||= "(this entry only contains html content)";

    # we don't munge the subject... just clean it
    $match->{subject} = $entry->subject_text || '(no subject)';

    # also useful information that we want for later
    $match->{url}      = $entry->url;
    $match->{tags}     = $entry->tag_map;
    $match->{security} = $entry->security;
    $match->{security} = 'access'
        if $match->{security} eq 'usemask' && $entry->allowmask == 1;
    $match->{eventtime} = $entry->eventtime_mysql;
    return;
}

# Fill in the display fields of $match for a comment hit, or placeholder text
# when the comment or its entry cannot be loaded, is not valid, or is not
# visible to $remote.
sub _describe_comment_match {
    my ( $match, $remote ) = @_;

    my $cmt = LJ::Comment->new( $match->{journalid}, jtalkid => $match->{jtalkid} );

    # only ask for the entry when there is a comment object to ask
    my $entry = $cmt ? $cmt->entry : undef;

    # we filtered by security in the search, but it may have changed since
    # the item was indexed, so check validity and visibility again
    my $usable =
           $entry
        && $entry->valid
        && $entry->visible_to( $remote )
        && $cmt->valid
        && $cmt->visible_to( $remote );
    if ( !$usable ) {
        $match->{entry}   = '(sorry, this comment has been deleted or is otherwise unavailable)';
        $match->{subject} = 'Comment deleted or unavailable.';
        return;
    }

    # use a text only version of the comment for excerpt purposes. best effort.
    $match->{entry} = $cmt->body_text;
    $match->{entry} ||= "(this comment only contains html content)";

    # we don't munge the subject... just clean it
    $match->{subject} = $cmt->subject_text || '(no subject)';

    # also useful information that we want for later
    $match->{url}      = $cmt->url;
    $match->{security} = $entry->security;
    $match->{security} = 'access'
        if $match->{security} eq 'usemask' && $entry->allowmask == 1;
    $match->{eventtime} = $cmt->{datepost};
    return;
}

# Decorate entry/comment search results with subject, URL, security and
# excerpt information.
#
# Arguments:
#   $sx       - Sphinx client; used here only for BuildExcerpts.
#   $query    - query text handed to BuildExcerpts.
#   $res      - result hashref from the search; the hashrefs inside
#               $res->{matches} are modified in place.
#   $remoteid - userid of the viewer, used for the visibility checks.
#
# Returns $res. Nothing is touched when $res->{total} is not positive.
sub _build_output {
    my ( $sx, $query, $res, $remoteid ) = @_;
    return $res unless $res->{total} > 0;

    # the viewer is the same for every match, so load it once
    my $remote = LJ::load_userid( $remoteid );

    # @out holds the very same hashrefs that live in $res->{matches}
    my @out;
    foreach my $match ( @{ $res->{matches} } ) {
        if ( $match->{jtalkid} == 0 ) {
            _describe_entry_match( $match, $remote );
        } elsif ( $match->{jtalkid} > 0 ) {
            _describe_comment_match( $match, $remote );
        } else {
            # a negative jtalkid is neither an entry nor a comment
            next;
        }
        push @out, $match;
    }

    # FIXME: We are using English stemming in this index. We could try to build separate
    # stemmed indices for other languages, if we want.
    my $exc  = $sx->BuildExcerpts( [ map { $_->{entry} } @out ],   'dw1delta', $query, {} ) || [];
    my $subj = $sx->BuildExcerpts( [ map { $_->{subject} } @out ], 'dw1delta', $query, {} ) || [];

    # excerpts are paired with hits by position, which only works when there
    # is exactly one per hit. Bodies and subjects are checked independently so
    # that a failed subject call leaves the plain subjects in place.
    my $bodies_ok   = scalar( @out ) == scalar( @$exc );
    my $subjects_ok = scalar( @out ) == scalar( @$subj );

    foreach my $m ( @out ) {
        delete $m->{entry};
        if ( $bodies_ok ) {
            $m->{excerpt} = shift @$exc;
            $m->{subject} = shift @$subj if $subjects_ok;
        } else {
            # something terrible has happened..., user gets no excerpts :(
            $m->{excerpt} = '(something terrible happened to the excerpts)';
        }
    }

    return $res;
}
# Gearman handler: run one search and return the serialized result.
#
# Arguments:
#   $_[0] - the job object; its arg() is thawed with Storable and treated as
#           a hashref of search options (query, support, remoteid, plus
#           whatever _run_search reads).
#
# Returns a Storable::nfreeze'd result structure, or undef when there is no
# query, the search itself yields nothing, or the output builder cannot
# produce a result.
sub sphinx_search {
    my $job = $_[0];

    my $args = Storable::thaw( $job->arg ) || {};
    return undef unless $args->{query};

    my $sx             = Sphinx::Search->new();
    my $search_results = _run_search( $sx, $args );
    return undef unless $search_results;

    # note: _run_search may have stripped surrounding quotes from
    # $args->{query}, so the builders see the text that was actually searched
    my $res =
        $args->{support}
        ? _build_output_support( $sx, $args->{query}, $search_results, $args->{remoteid} )
        : _build_output( $sx, $args->{query}, $search_results, $args->{remoteid} );

    # _build_output_support returns nothing when it has no database reader or
    # cannot load the remote user; Storable::nfreeze needs a reference, so
    # report that as a failed search instead of dying here
    return undef unless $res;

    return Storable::nfreeze( $res );
}