#!/usr/bin/perl
#
# sphinx-search-gm
#
# This Gearman worker is responsible for taking a search and issuing it to the
# Sphinx searchd.
#
# Authors:
#      Mark Smith <mark@dreamwidth.org>
#
# Copyright (c) 2009-2013 by Dreamwidth Studios, LLC.
#
# This program is free software; you may redistribute it and/or modify it under
# the same terms as Perl itself.  For a copy of the license, please reference
# 'perldoc perlartistic' or 'perldoc perlgpl'.
#

use strict;
BEGIN {
    require "$ENV{LJHOME}/cgi-bin/ljlib.pl";
}

use Gearman::Worker;
use LJ::Worker::Gearman;
use Sphinx::Search;
use Storable;

# Register sphinx_search() as the handler for the 'sphinx_search' Gearman
# function.  The sub is defined further down the file; that is fine because
# Perl compiles named subs before any of these statements run.
gearman_decl( 'sphinx_search'  => \&sphinx_search );

# Start processing jobs.
# NOTE(review): presumably this loops and does not return -- confirm against
# LJ::Worker::Gearman.
gearman_work();

# Configure the Sphinx client for one request and run the query.
#
# Arguments:
#   $sx   - Sphinx::Search client; it is configured in place.
#   $args - hashref of request options.  Keys read here: query, offset,
#           support, sort_by, userid, include_comments, ignore_security and
#           allowmask.
#
# Side effect: when the whole query is wrapped in a matching pair of quotes,
# the quotes are removed from $args->{query} in place (so the caller sees the
# stripped text too) and the search runs with SPH_MATCH_PHRASE.
#
# Returns whatever $sx->Query returns for the chosen index list.
sub _run_search {
    my ( $sx, $args ) = @_;

    $sx->SetServer( @LJ::SPHINX_SEARCHD );

    # both encoders are identity functions: strings pass through untouched
    $sx->SetEncoders( sub { shift }, sub { shift } );

    # defaults, some of which are overridden below
    $sx->SetMatchMode( SPH_MATCH_ALL )
       ->SetSortMode( SPH_SORT_RELEVANCE )
       ->SetMaxQueryTime( 15_000 )
       ->SetLimits( $args->{offset} || 0, 20 );

    # adjust the match mode if there are quotes around the entire query.  the
    # backreference requires the closing quote to be the same character as the
    # opening one, so a mismatched pair such as 'foo" is left alone.
    if ( $args->{query} =~ s/^(['"])(.+)\1$/$2/ ) {
        $sx->SetMatchMode( SPH_MATCH_PHRASE );
    }

    my $index;

    # SUPPORT LOG SEARCH OPTIONS
    if ( $args->{support} ) {
        # No filters are applied here.  Security is handled at the viewing
        # layer.
        $index = 'dwsupport';

        # Sort newest content first.
        $sx->SetSortMode( SPH_SORT_ATTR_DESC, 'touchtime' );

    # ENTRY/COMMENT SEARCH OPTIONS
    } else {
        $index = 'dw1,dw1delta';

        # setup the sort they've requested; anything else keeps the relevance
        # sort configured above
        if ( $args->{sort_by} eq 'new' ) {
            $sx->SetSortMode( SPH_SORT_ATTR_DESC, 'date_posted' );
        } elsif ( $args->{sort_by} eq 'old' ) {
            $sx->SetSortMode( SPH_SORT_ATTR_ASC, 'date_posted' );
        }

        # filter to a journal if we have a userid set; else, filter on allow_global_search items
        if ( $args->{userid} ) {
            $sx->SetFilter( 'journalid', [ $args->{userid} ] );
        } else {
            $sx->SetFilter( 'allow_global_search', [ 1 ] );
        }

        # we don't want items marked deleted (user is not visible)
        $sx->SetFilter( 'is_deleted', [ 0 ] );

        # filter in/out comments.  a missing include_comments compares equal to
        # 0 here, so comments are excluded unless explicitly requested.
        $sx->SetFilterRange( 'jtalkid', 0, 0 )
            if $args->{include_comments} == 0;

        # security filtering is a dangerous game.  basically, the caller tells us whether to
        # ignore security or gives us a mask of bits to work with.  from that we intuit what
        # security options to enable on our filters to Sphinx.
        unless ( $args->{ignore_security} ) {
            # allow public posts and anything the mask allows
            my @bits = ( 102, LJ::bit_breakdown( $args->{allowmask} ) );
            $sx->SetFilter( 'security_bits', \@bits );

            # private entries should only be viewable when we choose to ignore security
            # this works around some data where the entry is marked in sphinx
            # as being both private and having an allowmask
            $sx->SetFilter( 'security_bits', [ 0 ], 1 );
        }
    }

    return $sx->Query( $args->{query}, $index );
}

# Turn raw support-log search matches into displayable results.
#
# Arguments:
#   $sx       - Sphinx::Search client, used for BuildExcerpts.
#   $query    - query text handed to BuildExcerpts.
#   $res      - result hashref from the search; updated in place.
#   $remoteid - userid of the viewer, used for the per-item security checks.
#
# Returns:
#   $res untouched when it reports no results; nothing (undef in scalar
#   context) when no database reader or remote user can be obtained; otherwise
#   $res with {matches} reduced to the items the viewer may read, each carrying
#   url, type, spid, category, subject and excerpt.
sub _build_output_support {
    my ( $sx, $query, $res, $remoteid ) = @_;
    return $res if $res->{total} <= 0;

    my $dbr = LJ::get_db_reader()
        or return;
    my $remote = LJ::load_userid( $remoteid )
        or return;

    # this is weird, I push the hashrefs onto @out from $res->{matches} for
    # convenience only... they're the same hashrefs you know and love
    my @out;

    my %spcache;
    foreach my $match ( @{ $res->{matches} } ) {
        # Yes, we have to use raw SQL here...
        my ( $spid, $type, $content ) = $dbr->selectrow_array(
            q{SELECT spid, type, message FROM supportlog WHERE splid = ?},
            undef, $match->{doc}
        );
        next if $dbr->err;

        # No row came back for this document id (the index can be ahead of or
        # behind the table), so there is no request to look up.  Skip it rather
        # than asking for request "undef" and caching that under an empty key.
        next unless defined $spid;

        # Fetch the request (with caching, as often terms are repeated in requests)
        my $sp = ( $spcache{$spid} ||= LJ::Support::load_request( $spid ) )
            or next;

        # Now, security check this item...  the category check is the baseline;
        # internal and screened log lines use their own, replacing it.
        my $visible = LJ::Support::can_read_cat( $sp->{_cat}, $remote );
        if ( $type eq 'internal' ) {
            $visible = LJ::Support::can_read_internal( $sp, $remote );
        } elsif ( $type eq 'screened' ) {
            $visible = LJ::Support::can_read_screened( $sp, $remote );
        }
        next unless $visible;

        $match->{url} = "$LJ::SITEROOT/support/see_request?id=" . $spid;
        $match->{type} = $type;
        $match->{spid} = $spid;
        $match->{category} = $sp->{_cat}->{catname};
        $match->{subject} = $sp->{subject};
        $match->{content} = $content;
        push @out, $match;
    }

    # Build the excerpts for the bodies.
    my $exc = $sx->BuildExcerpts( [ map { $_->{content} } @out ], 'dwsupport', $query, {} ) || [];

    # if we have a matching number of excerpts to events, then we can determine
    # which one goes with which post.
    if ( scalar( @out ) == scalar( @$exc ) ) {
        foreach my $m ( @out ) {
            delete $m->{content};
            $m->{excerpt} = shift @$exc;
        }

    } else {
        # something terrible has happened..., user gets no excerpts :(
        foreach my $m ( @out ) {
            delete $m->{content};
            $m->{excerpt} = '(something terrible happened to the excerpts)';
        }
    }

    # Only the matches that made it into @out were given an excerpt, so this
    # drops everything that was skipped above (errors, missing rows, items the
    # viewer may not read).
    $res->{matches} = [ grep { exists $_->{excerpt} } @{$res->{matches}} ];
    return $res;
}

# Decorate entry/comment search matches with what is needed to display them.
#
# Arguments:
#   $sx       - Sphinx::Search client, used for BuildExcerpts.
#   $query    - query text handed to BuildExcerpts.
#   $res      - result hashref from the search; its matches are updated in
#               place.
#   $remoteid - userid of the viewer, used to re-check visibility.
#
# A match with jtalkid == 0 is treated as an entry, one with jtalkid > 0 as a
# comment.  Each gets subject and excerpt, plus url, security and eventtime
# (and tags for entries) when the item could be loaded and is visible to the
# viewer; otherwise it gets placeholder text.
#
# Returns $res.
sub _build_output {
    my ( $sx, $query, $res, $remoteid ) = @_;

    # nothing to decorate
    return $res unless $res->{total} > 0;

    # the viewer is the same for every match, so load them once
    my $remote = LJ::load_userid( $remoteid );

    # this is weird, I push the hashrefs onto @out from $res->{matches} for
    # convenience only... they're the same hashrefs you know and love
    my @out;

    # try to build some excerpts of these searches, which involves us loading
    # up the exact entry contents...
    foreach my $match ( @{ $res->{matches} } ) {
        if ( $match->{jtalkid} == 0 ) {
            my $entry = LJ::Entry->new( $match->{journalid}, jitemid => $match->{jitemid} );

            # check for validity and for security
            # we filtered by security earlier, but there's a chance it was changed
            # but not yet indexed
            if ( $entry && $entry->valid && $entry->visible_to( $remote ) ) {
                # use text only version of event for excerpt purposes.  best effort.
                $match->{entry} = $entry->event_text;
                $match->{entry} =~ s#<(?:br|p)\s*/?># #gi;
                $match->{entry} = LJ::strip_html( $match->{entry} );
                $match->{entry} ||= "(this entry only contains html content)";

                # we don't munge the subject... just clean it
                $match->{subject} = $entry->subject_text || '(no subject)';

                # also useful information that we want for later
                $match->{url} = $entry->url;
                $match->{tags} = $entry->tag_map;
                $match->{security} = $entry->security;
                $match->{security} = 'access'
                    if $match->{security} eq 'usemask' &&
                        $entry->allowmask == 1;
                $match->{eventtime} = $entry->eventtime_mysql;

            } else {
                # something happened, couldn't get the entry
                $match->{entry} = '(sorry, this entry has been deleted or is otherwise unavailable)';
                $match->{subject} = 'Entry deleted or unavailable.';
            }
            push @out, $match;

        } elsif ( $match->{jtalkid} > 0 ) {
            my $cmt = LJ::Comment->new( $match->{journalid}, jtalkid => $match->{jtalkid} );

            # only ask for the parent entry once we know we have a comment
            # object to ask; a missing comment falls through to the placeholder
            my $entry = $cmt ? $cmt->entry : undef;

            # check for validity and for security
            # we filtered by security earlier, but there's a chance it was changed
            # but not yet indexed
            if ( $entry && $entry->valid && $entry->visible_to( $remote ) &&
                    $cmt->valid && $cmt->visible_to( $remote ) ) {
                # use text only version of event for excerpt purposes.  best effort.
                $match->{entry} = $cmt->body_text;
                $match->{entry} ||= "(this comment only contains html content)";

                # we don't munge the subject... just clean it
                $match->{subject} = $cmt->subject_text || '(no subject)';

                # also useful information that we want for later
                $match->{url} = $cmt->url;
                $match->{security} = $entry->security;
                $match->{security} = 'access'
                    if $match->{security} eq 'usemask' &&
                        $entry->allowmask == 1;
                $match->{eventtime} = $cmt->{datepost};

            } else {
                # something happened, couldn't get the comment
                $match->{entry} = '(sorry, this comment has been deleted or is otherwise unavailable)';
                $match->{subject} = 'Comment deleted or unavailable.';
            }
            push @out, $match;
        }
    }

    # FIXME: We are using English stemming in this index. We could try to build separate
    # stemmed indices for other languages, if we want.
    my $exc = $sx->BuildExcerpts( [ map { $_->{entry} } @out ], 'dw1delta', $query, {} ) || [];
    my $subj = $sx->BuildExcerpts( [ map { $_->{subject} } @out ], 'dw1delta', $query, {} ) || [];

    # the two excerpt calls can fail independently, so only swap in the
    # subject excerpts when we got exactly one per match; otherwise the
    # subjects set above are kept as they are.
    my $have_subj = scalar( @$subj ) == scalar( @out );

    # if we have a matching number of excerpts to events, then we can determine
    # which one goes with which post.
    if ( scalar( @out ) == scalar( @$exc ) ) {
        foreach my $m ( @out ) {
            delete $m->{entry};
            $m->{excerpt} = shift @$exc;
            $m->{subject} = shift @$subj if $have_subj;
        }

    } else {
        # something terrible has happened..., user gets no excerpts :(
        foreach my $m ( @out ) {
            delete $m->{entry};
            $m->{excerpt} = '(something terrible happened to the excerpts)';
        }
    }

    return $res;
}

# Gearman handler: run one search and return the frozen result.
#
# Arguments:
#   $job - job object; $job->arg is expected to be a Storable image of a
#          hashref of search options (query, support, remoteid, plus whatever
#          _run_search reads).
#
# Returns a Storable::nfreeze image of the result hashref, or undef when the
# arguments cannot be thawed, there is no query, the search returns nothing,
# or the output could not be built.
sub sphinx_search {
    my $job = $_[0];

    # thaw croaks on a corrupt image; treat that like any other unusable
    # request, but say why in the log.
    my $args;
    unless ( eval { $args = Storable::thaw( $job->arg ); 1 } ) {
        warn "sphinx_search: unable to thaw job arguments: $@";
        return undef;
    }
    return undef unless UNIVERSAL::isa( $args, 'HASH' ) && $args->{query};

    my $sx = Sphinx::Search->new();
    my $search_results = _run_search( $sx, $args );
    return undef unless $search_results;

    # note: _run_search may have stripped surrounding quotes from
    # $args->{query}, so the excerpts are built from the stripped text.
    my $res = $args->{support} ?
        _build_output_support( $sx, $args->{query}, $search_results, $args->{remoteid} ) :
        _build_output( $sx, $args->{query}, $search_results, $args->{remoteid} );

    # the support builder returns nothing when it cannot get a database handle
    # or load the remote user, and nfreeze croaks on a non-reference.
    return undef unless ref $res;

    return Storable::nfreeze( $res );
}

