#!/usr/bin/perl
#
# sphinx-search-gm
#
# This Gearman worker is responsible for taking a search and issuing it to the
# Sphinx searchd.
#
# Authors:
#      Mark Smith
#
# Copyright (c) 2009-2013 by Dreamwidth Studios, LLC.
#
# This program is free software; you may redistribute it and/or modify it under
# the same terms as Perl itself.  For a copy of the license, please reference
# 'perldoc perlartistic' or 'perldoc perlgpl'.
#

use strict;

BEGIN {
    require "$ENV{LJHOME}/cgi-bin/ljlib.pl";
}

use Gearman::Worker;
use LJ::Worker::Gearman;
use Sphinx::Search;
use Storable;

gearman_decl( 'sphinx_search' => \&sphinx_search );
gearman_work();

# Configure the Sphinx client from the search arguments and run the query.
#
#   $sx   - Sphinx::Search client object
#   $args - hashref of search arguments; keys read here: query, offset,
#           support, sort_by, userid, include_comments, ignore_security,
#           allowmask
#
# Returns whatever $sx->Query returns.  Note that $args->{query} is modified
# in place when the whole query is wrapped in quotes (the quotes are removed),
# and the caller relies on seeing that modified query afterwards.
sub _run_search {
    my ( $sx, $args ) = @_;
    my $index = '*';

    $sx->SetServer( @LJ::SPHINX_SEARCHD );
    $sx->SetEncoders( sub { shift }, sub { shift } );

    $sx->SetMatchMode( SPH_MATCH_ALL )
       ->SetSortMode( SPH_SORT_RELEVANCE )
       ->SetMaxQueryTime( 15_000 )
       ->SetLimits( $args->{offset} || 0, 20 );

    # adjust the match mode if there are quotes around the entire query
    if ( $args->{query} =~ s/^['"](.+)['"]$/$1/ ) {
        $sx->SetMatchMode( SPH_MATCH_PHRASE );
    }

    # SUPPORT LOG SEARCH OPTIONS
    if ( $args->{support} ) {
        # No extra filters are applied here; security for support results is
        # checked later, when the output is built.
        $index = 'dwsupport';

        # Sort newest content first.
        $sx->SetSortMode( SPH_SORT_ATTR_DESC, 'touchtime' );

    # ENTRY/COMMENT SEARCH OPTIONS
    } else {
        $index = 'dw1,dw1delta';

        # setup the sort they've requested
        if ( $args->{sort_by} eq 'new' ) {
            $sx->SetSortMode( SPH_SORT_ATTR_DESC, 'date_posted' );
        } elsif ( $args->{sort_by} eq 'old' ) {
            $sx->SetSortMode( SPH_SORT_ATTR_ASC, 'date_posted' );
        }

        # filter to a journal if we have a userid set; else, filter on allow_global_search items
        $sx->SetFilter( 'journalid', [ $args->{userid} ] )
            if $args->{userid};
        $sx->SetFilter( 'allow_global_search', [ 1 ] )
            unless $args->{userid};

        # we don't want items marked deleted (user is not visible)
        $sx->SetFilter( 'is_deleted', [ 0 ] );

        # filter in/out comments
        $sx->SetFilterRange( 'jtalkid', 0, 0 )
            if $args->{include_comments} == 0;

        # security filtering is a dangerous game.  basically, the caller tells us whether to
        # ignore security or gives us a mask of bits to work with.  from that we intuit what
        # security options to enable on our filters to Sphinx.
        unless ( $args->{ignore_security} ) {
            # allow public posts and anything the mask allows
            my @bits = ( 102, LJ::bit_breakdown( $args->{allowmask} ) );
            $sx->SetFilter( 'security_bits', \@bits );

            # private entries should only be viewable when we choose to ignore security
            # this works around some data where the entry is marked in sphinx
            # as being both private and having an allowmask
            $sx->SetFilter( 'security_bits', [ 0 ], 1 );
        }
    }

    return $sx->Query( $args->{query}, $index );
}

# Ask searchd to build excerpts for a list of documents.
#
#   $sx    - Sphinx::Search client object
#   $docs  - arrayref of document texts
#   $index - name of the index whose settings are used for excerpting
#   $query - the search query to highlight
#
# Always returns an arrayref: empty when there is nothing to excerpt (no
# request is sent in that case) or when BuildExcerpts returns a false value.
sub _excerpts {
    my ( $sx, $docs, $index, $query ) = @_;

    return [] unless @$docs;
    return $sx->BuildExcerpts( $docs, $index, $query, {} ) || [];
}

# Decorate support-log search results with request details and excerpts.
#
#   $sx       - Sphinx::Search client object
#   $query    - the search query (used for excerpt highlighting)
#   $res      - result hashref from the search; $res->{matches} is rewritten
#               to contain only the matches the remote user may read
#   $remoteid - userid of the user performing the search
#
# Returns $res, or nothing if no database reader or remote user is available.
sub _build_output_support {
    my ( $sx, $query, $res, $remoteid ) = @_;
    return $res if $res->{total} <= 0;

    my $dbr = LJ::get_db_reader() or return;
    my $remote = LJ::load_userid( $remoteid ) or return;

    # this is weird, I push the hashrefs onto @out from $res->{matches} for
    # convenience only... they're the same hashrefs you know and love
    my @out;
    my %spcache;

    foreach my $match ( @{ $res->{matches} } ) {
        # Yes, we have to use raw SQL here...
        my ( $spid, $type, $content ) = $dbr->selectrow_array(
            q{SELECT spid, type, message FROM supportlog WHERE splid = ?},
            undef, $match->{doc}
        );
        next if $dbr->err;

        # The row may be gone from the database even though it is still in
        # the index; there is nothing to load or show in that case.
        next unless defined $spid;

        # Fetch the request (with caching, as often terms are repeated in requests)
        my $sp = ( $spcache{$spid} ||= LJ::Support::load_request( $spid ) )
            or next;

        # Now, security check this item...
        my $visible = LJ::Support::can_read_cat( $sp->{_cat}, $remote );
        if ( $type eq 'internal' ) {
            $visible = LJ::Support::can_read_internal( $sp, $remote );
        } elsif ( $type eq 'screened' ) {
            $visible = LJ::Support::can_read_screened( $sp, $remote );
        }
        next unless $visible;

        $match->{url}      = "$LJ::SITEROOT/support/see_request?id=" . $spid;
        $match->{type}     = $type;
        $match->{spid}     = $spid;
        $match->{category} = $sp->{_cat}->{catname};
        $match->{subject}  = $sp->{subject};
        $match->{content}  = $content;
        push @out, $match;
    }

    # Build the excerpts for the bodies.
    my $exc = _excerpts( $sx, [ map { $_->{content} } @out ], 'dwsupport', $query );

    # if we have a matching number of excerpts to events, then we can determine
    # which one goes with which post.
    if ( scalar( @out ) == scalar( @$exc ) ) {
        foreach my $m ( @out ) {
            delete $m->{content};
            $m->{excerpt} = shift @$exc;
        }

    } else {
        # something terrible has happened..., user gets no excerpts :(
        foreach my $m ( @out ) {
            delete $m->{content};
            $m->{excerpt} = '(something terrible happened to the excerpts)';
        }
    }

    # Only matches that passed the checks above were given an excerpt; drop
    # everything else from the result set.
    $res->{matches} = [ grep { exists $_->{excerpt} } @{ $res->{matches} } ];
    return $res;
}

# Decorate entry/comment search results with subject, URL, security and
# excerpt information.
#
#   $sx       - Sphinx::Search client object
#   $query    - the search query (used for excerpt highlighting)
#   $res      - result hashref from the search; its matches are modified in place
#   $remoteid - userid of the user performing the search
#
# Returns $res.
sub _build_output {
    my ( $sx, $query, $res, $remoteid ) = @_;

    # try to build some excerpts of these searches, which involves us loading
    # up the exact entry contents...
    if ( $res->{total} > 0 ) {
        # the viewer is the same for every match, so load it once
        my $remote = LJ::load_userid( $remoteid );

        # this is weird, I push the hashrefs onto @out from $res->{matches} for
        # convenience only... they're the same hashrefs you know and love
        my @out;

        foreach my $match ( @{ $res->{matches} } ) {
            if ( $match->{jtalkid} == 0 ) {
                my $entry = LJ::Entry->new( $match->{journalid}, jitemid => $match->{jitemid} );

                # check for validity and for security
                # we filtered by security earlier, but there's a chance it was changed
                # but not yet indexed
                if ( $entry && $entry->valid && $entry->visible_to( $remote ) ) {
                    # use text only version of event for excerpt purposes.  best effort.
                    $match->{entry} = $entry->event_text;
                    $match->{entry} =~ s#<(?:br|p)\s*/?># #gi;
                    $match->{entry} = LJ::strip_html( $match->{entry} );
                    $match->{entry} ||= "(this entry only contains html content)";

                    # we don't munge the subject... just clean it
                    $match->{subject} = $entry->subject_text || '(no subject)';

                    # also useful information that we want for later
                    $match->{url} = $entry->url;
                    $match->{tags} = $entry->tag_map;
                    $match->{security} = $entry->security;
                    $match->{security} = 'access'
                        if $match->{security} eq 'usemask' &&
                           $entry->allowmask == 1;
                    $match->{eventtime} = $entry->eventtime_mysql;

                } else {
                    # something happened, couldn't get the entry
                    $match->{entry} = '(sorry, this entry has been deleted or is otherwise unavailable)';
                    $match->{subject} = 'Entry deleted or unavailable.';
                }

                push @out, $match;

            } elsif ( $match->{jtalkid} > 0 ) {
                my $cmt = LJ::Comment->new( $match->{journalid}, jtalkid => $match->{jtalkid} );

                # only ask for the entry if we actually got a comment object;
                # otherwise fall through to the "unavailable" branch below
                my $entry = $cmt ? $cmt->entry : undef;

                # check for validity and for security
                # we filtered by security earlier, but there's a chance it was changed
                # but not yet indexed
                if ( $entry && $entry->valid && $entry->visible_to( $remote ) &&
                     $cmt && $cmt->valid && $cmt->visible_to( $remote ) ) {
                    # use text only version of event for excerpt purposes.  best effort.
                    $match->{entry} = $cmt->body_text;
                    $match->{entry} ||= "(this comment only contains html content)";

                    # we don't munge the subject... just clean it
                    $match->{subject} = $cmt->subject_text || '(no subject)';

                    # also useful information that we want for later
                    $match->{url} = $cmt->url;
                    $match->{security} = $entry->security;
                    $match->{security} = 'access'
                        if $match->{security} eq 'usemask' &&
                           $entry->allowmask == 1;
                    $match->{eventtime} = $cmt->{datepost};

                } else {
                    # something happened, couldn't get the comment
                    $match->{entry} = '(sorry, this comment has been deleted or is otherwise unavailable)';
                    $match->{subject} = 'Comment deleted or unavailable.';
                }

                push @out, $match;
            }
        }

        # FIXME: We are using English stemming in this index.  We could try to build separate
        # stemmed indices for other languages, if we want.
        my $exc  = _excerpts( $sx, [ map { $_->{entry} } @out ],   'dw1delta', $query );
        my $subj = _excerpts( $sx, [ map { $_->{subject} } @out ], 'dw1delta', $query );

        # if we have a matching number of excerpts to events, then we can determine
        # which one goes with which post.
        if ( scalar( @out ) == scalar( @$exc ) ) {
            # the subject excerpts come from a separate request that can fail
            # on its own; if it did, keep the plain subjects rather than
            # overwriting them with undef
            my $have_subjects = scalar( @$subj ) == scalar( @out );

            foreach my $m ( @out ) {
                delete $m->{entry};
                $m->{excerpt} = shift @$exc;
                $m->{subject} = shift @$subj if $have_subjects;
            }

        } else {
            # something terrible has happened..., user gets no excerpts :(
            foreach my $m ( @out ) {
                delete $m->{entry};
                $m->{excerpt} = '(something terrible happened to the excerpts)';
            }
        }
    }

    return $res;
}

# Gearman entry point for the 'sphinx_search' function.
#
#   $job - Gearman job; its argument is a Storable-frozen hashref of search
#          arguments (see _run_search for the keys used)
#
# Returns a Storable-frozen (nfreeze) result hashref, or undef when there is
# no query, the search fails, or the output could not be built.
sub sphinx_search {
    my $job = $_[0];

    my $args = Storable::thaw( $job->arg ) || {};
    return undef unless $args->{query};

    my $sx = Sphinx::Search->new();
    my $search_results = _run_search( $sx, $args );
    return undef unless $search_results;

    # _run_search may have stripped surrounding quotes from $args->{query};
    # the output builders get the stripped form.
    my $res = $args->{support}
        ? _build_output_support( $sx, $args->{query}, $search_results, $args->{remoteid} )
        : _build_output( $sx, $args->{query}, $search_results, $args->{remoteid} );

    # the support builder returns nothing when it cannot get a database reader
    # or the remote user; nfreeze would croak on a non-reference
    return undef unless $res;

    return Storable::nfreeze( $res );
}