# This code was forked from the LiveJournal project owned and operated
# by Live Journal, Inc. The code has been modified and expanded by
# Dreamwidth Studios, LLC. These files were originally licensed under
# the terms of the license supplied by Live Journal, Inc, which can
# currently be found at:
#
# http://code.livejournal.org/trac/livejournal/browser/trunk/LICENSE-LiveJournal.txt
#
# In accordance with the original license, this code and all its
# modifications are provided under the GNU General Public License.
# A copy of that license can be found in the LICENSE file included as
# part of this distribution.

package LJ::SynSuck;
use strict;
use HTTP::Status;
use Log::Log4perl;
my $log = Log::Log4perl->get_logger(__PACKAGE__);

use LJ::Utils qw(md5_struct);
use LJ::Protocol;
use LJ::ParseFeed;
use LJ::CleanHTML;
use DW::FeedCanonicalizer;

# Fetch and process a single syndicated feed.
#
# $urow is a hashref describing the feed; this sub reads its "userid"
# key and passes the whole row on to get_content() / process_content().
#
# Returns nothing when $urow is false, the result of delay() when the
# feed account cannot be loaded or is not visible, 0 when get_content()
# yields nothing to process, and otherwise whatever process_content()
# returns.
sub update_feed {
    my ($urow) = @_;
    return unless $urow;

    my $userid = $urow->{userid};

    # we can't deal with non-visible journals. try again in a couple
    # hours. maybe they were unsuspended or whatever. a userid that
    # fails to load is treated the same way instead of dying on an
    # undef method call.
    my $su = LJ::load_userid($userid);
    return delay( $userid, 120, "non_statusvis_v" )
        unless $su && $su->is_visible;

    # we're a child process now, need to invalidate caches and
    # get a new database handle
    LJ::start_request();

    my $resp = get_content($urow) or return 0;
    return process_content( $urow, $resp );
}

# Multiplier applied to a base delay after $failcount failures:
# 2**$failcount, with the exponent capped at 7.
sub _backoff_multiplier {
    my ($failcount) = @_;
    return 2**( $failcount > 7 ? 7 : $failcount );
}

# Schedule the next check of a feed and record why.
#
#   $userid  - userid of the row in "syndicated" to update
#   $minutes - base delay in minutes before the next check
#   $status  - value written to the laststatus column
#   $synurl  - optional; when defined it is canonicalized and stored as
#              fuzzy_token (otherwise the stored token is left alone)
#   $opts    - optional hashref; $opts->{backoff} is one of
#                'escalate' (default) increment failcount
#                'reset'              set failcount to 0
#                'hold'               leave failcount unchanged
#
# When the resulting failcount is non-zero the base delay is multiplied
# by _backoff_multiplier() and capped at 30 days. 0-4 random minutes are
# always added on top.
#
# Always returns undef. The explicit undef is kept because callers
# propagate it with "return delay(...)".
sub delay {
    my ( $userid, $minutes, $status, $synurl, $opts ) = @_;
    $opts //= {};
    my $backoff = $opts->{backoff} // 'escalate';

    my $token = defined $synurl ? DW::FeedCanonicalizer::canonicalize($synurl) : undef;

    my $dbh = LJ::get_db_writer();

    my $failcount =
        $dbh->selectrow_array( "SELECT failcount FROM syndicated WHERE userid=?", undef, $userid )
        || 0;

    if ( $backoff eq 'reset' ) {
        $failcount = 0;
    }
    elsif ( $backoff eq 'escalate' ) {
        $failcount++;
    }

    # 'hold' leaves failcount unchanged

    # apply exponential backoff on escalate/hold (if failcount > 0)
    if ($failcount) {
        $minutes = $minutes * _backoff_multiplier($failcount);

        # cap at 30 days
        my $max_minutes = 30 * 24 * 60;
        $minutes = $max_minutes if $minutes > $max_minutes;
    }

    # add some random backoff to avoid waves building up
    $minutes += int( rand(5) );

    $log->info(
        "userid=$userid: status=$status backoff=$backoff failcount=$failcount delay=${minutes}m");

    $dbh->do(
        "UPDATE syndicated SET lastcheck=NOW(), checknext=DATE_ADD(NOW(), "
            . "INTERVAL ? MINUTE), laststatus=?, failcount=?, "
            . "fuzzy_token = COALESCE(?,fuzzy_token) WHERE userid=?",
        undef, $minutes, $status, $failcount, $token, $userid
    );
    return undef;
}

# Maximum number of bytes of feed content that will be accepted.
#
# $u is an optional user object for the feed. The limit comes from
# $LJ::SYNSUCK_MAX_SIZE (default 3000 kb), or from
# $LJ::SYNSUCK_LARGE_MAX_SIZE (default 6000 kb) when $u has the
# siteadmin:largefeedsize priv.
sub max_size {
    my ($u) = @_;    # optional user object for feed

    my $max_size = $LJ::SYNSUCK_MAX_SIZE || 3000;    # in kb
    if ( $u && $u->has_priv( "siteadmin", "largefeedsize" ) ) {
        $max_size = $LJ::SYNSUCK_LARGE_MAX_SIZE || 6000;    # in kb
    }
    return 1024 * $max_size;                                # in bytes
}

# Download the feed described by $urow.
#
# Reads user, userid, synurl, lastmod, etag, numreaders and checknext
# from the $urow hashref.
#
# Returns an arrayref [ $http_response, $decoded_content ] when there is
# new content to process. Returns a false value otherwise: either the
# row's checknext changed since $urow was loaded (nothing is rescheduled
# here), or the fetch failed / was too big / was not modified, in which
# case the next check has been scheduled through delay().
sub get_content {
    my ($urow) = @_;

    my ( $user, $userid, $synurl, $lastmod, $etag, $readers ) =
        map { $urow->{$_} } qw(user userid synurl lastmod etag numreaders);

    my $dbh = LJ::get_db_writer();

    # see if things have changed since we last looked and acquired the lock.
    # otherwise we could 1) check work, 2) get lock, and between 1 and 2 another
    # process could do both steps. we don't want to duplicate work already done.
    my $now_checknext =
        $dbh->selectrow_array( "SELECT checknext FROM syndicated " . "WHERE userid=?",
        undef, $userid );
    return if $now_checknext ne $urow->{checknext};

    my $ua          = LJ::get_useragent( role => 'syn_sucker' );
    my $reader_info = $readers ? "; $readers readers" : "";
    $ua->agent( "$LJ::SITENAME ($LJ::ADMIN_EMAIL; for $LJ::SITEROOT/users/$user/"
            . $reader_info
            . ")" );

    $log->info("Synsuck: $user ($synurl)");

    my $req = HTTP::Request->new( "GET", $synurl );

    # explicit call (with parens) so this compiles under strict even if
    # HTTP::Message has not been loaded yet at compile time; scalar
    # context is preserved by the assignment.
    my $can_accept = HTTP::Message::decodable();
    $req->header( 'Accept-Encoding',   $can_accept );
    $req->header( 'If-Modified-Since', LJ::time_to_http($lastmod) ) if $lastmod;
    $req->header( 'If-None-Match',     $etag ) if $etag;

    # start from an empty string so length() below never sees undef
    my $content = '';
    my $too_big;
    my $syn_u    = LJ::load_user($user);
    my $max_size = max_size($syn_u);

    # the body is collected by hand, 4096 bytes at a time, so that an
    # oversized feed stops being accumulated once it passes the limit
    my $res = eval {
        $ua->request(
            $req,
            sub {
                if ( length($content) > $max_size ) {
                    $too_big = 1;
                    return;
                }
                $content .= $_[0];
            },
            4096
        );
    };
    if ( $@ || !defined $res ) {
        return delay( $userid, 120, "lwp_death" );
    }
    if ($too_big) {
        return delay( $userid, 60, "toobig" );
    }

    if ( $res->is_error() ) {

        # http error
        $log->warn( "HTTP error for $user: " . $res->status_line() );

        # overload parseerror here because it's already there -- we'll
        # never have both an http error and a parse error on the
        # same request
        $syn_u->set_prop( "rssparseerror", $res->status_line() ) if $syn_u;

        delay( $userid, 3 * 60, "parseerror" );
        return;
    }

    # check if not modified
    if ( $res->code() == RC_NOT_MODIFIED ) {
        $log->debug("$user: not modified");
        return delay( $userid, $readers ? 60 : 24 * 60,
            "notmodified", $synurl, { backoff => 'reset' } );
    }

    # Since we are treating content specially above, we have to recreate
    # the HTTP::Message with it to get the decoded content. This is only
    # needed once we know the response is one we will actually parse.
    my $message = HTTP::Message->new( $res->headers, $content );
    $content = $message->decoded_content( charset => 'none' );

    return [ $res, $content ];
}

# helper function which takes feed XML
# and returns a list of $num items from the feed
# in proper order
#
#   $content - the feed XML as a string
#   $num     - maximum number of items to return (default 20)
#
# Returns a two-element list:
#   ( 1, { items => \@items, feed => $feed } )          on success
#   ( 0, { type => "noitems" } )                        nothing to post
#   ( 0, { type => "parseerror", message => $error } )  parser failed
sub parse_items_from_feed {
    my ( $content, $num ) = @_;
    $num ||= 20;
    return ( 0, { type => "noitems" } ) unless defined $content;

    # WARNING: blatant XML spec violation ahead...
    #
    # Blogger doesn't produce valid XML, since they don't handle encodings
    # correctly. So if we see they have no encoding (which is UTF-8 implicitly)
    # but it's not valid UTF-8, say it's Windows-1252, which won't
    # cause XML::Parser to barf... but there will probably be some bogus characters.
    # better than nothing I guess. (personally, I'd prefer to leave it broken
    # and have people bitch at Blogger, but jwz wouldn't stop bugging me)
    # XML::Parser doesn't include Windows-1252, but we put it in cgi-bin/XML/* for it
    # to find.

    # lower-cased declared encoding; stays '' when the document declares
    # none, so the comparisons below never operate on undef
    my $encoding = '';
    if ( $content =~ /(<\?xml.+?>)/ && $1 =~ /encoding=([\"\'])(.+?)\1/ ) {
        $encoding = lc($2);
    }
    if ( !$encoding && !LJ::is_utf8($content) ) {
        $content =~ s/\?>/ encoding='windows-1252' \?>/;
    }

    # WARNING: another hack...
    # People produce what they think is iso-8859-1, but they include
    # Windows-style smart quotes. Check for invalid iso-8859-1 and correct.
    if ( $encoding eq 'iso-8859-1' && $content =~ /[\x80-\x9F]/ ) {

        # They claimed they were iso-8859-1, but they are lying.
        # Assume it was Windows-1252.
        $log->debug("Invalid ISO-8859-1; assuming Windows-1252");
        $content =~ s/encoding=([\"\'])(.+?)\1/encoding='windows-1252'/;
    }

    # ANOTHER hack: if a feed asks for ANSI_X3.4-1968 (ASCII) or plain
    # 'ascii', alias it to us-ascii. compared literally so the '.' in the
    # name cannot match an arbitrary character.
    if ( $encoding eq 'ansi_x3.4-1968' || $encoding eq 'ascii' ) {
        $content =~ s/encoding=([\"\'])(.+?)\1/encoding='us-ascii'/;
    }

    # parsing time...
    my ( $feed, $error ) = LJ::ParseFeed::parse_feed($content);
    return ( 0, { type => "parseerror", message => $error } ) if $error;

    # another sanity check
    return ( 0, { type => "noitems" } ) unless ref $feed->{items} eq "ARRAY";

    my @items = reverse @{ $feed->{items} }
        or return ( 0, { type => "noitems" } );

    # If the feed appears to be datestamped, resort chronologically,
    # from earliest to latest - oldest entries are posted first, below.
    # Each item's timestamp is converted once up front rather than on
    # every comparison.
    if ( $items[0]->{time} ) {
        @items =
            map  { $_->[1] }
            sort { $a->[0] <=> $b->[0] }
            map  { [ LJ::mysqldate_to_time( $_->{time} ), $_ ] } @items;
    }

    # take the most recent $num
    splice( @items, 0, @items - $num ) if @items > $num;

    return ( 1, { items => \@items, feed => $feed } );
}

sub process_content {
    my ( $urow, $resp ) = @_;

    my ( $res, $content ) = @$resp;
    my ( $user, $userid, $synurl, $lastmod, $etag, $readers, $fuzzy_token ) =
        map { $urow->{$_} } qw(user userid synurl lastmod etag numreaders fuzzy_token);

    my $dbh = LJ::get_db_writer();

    my ( $ok, $rv ) = parse_items_from_feed( $content, 20 );
    unless ($ok) {
        if ( $rv->{type} eq "parseerror" ) {

            # parse error!
            if ( my $error = $rv->{message} ) {
                $log->warn("$user: parse error: $error");
                $error =~ s! at /.*!!;
                $error =~ s/^\n//;    # cleanup of newline at the beginning of the line
                my $syn_u = LJ::load_user($user);
                $syn_u->set_prop( "rssparseerror", $error ) if $syn_u;
            }
            delay( $userid, 3 * 60, "parseerror", $synurl );
            return;
        }
        elsif ( $rv->{type} eq "noitems" ) {
            return delay( $userid, 3 * 60, "noitems", $synurl );
        }
        else {
            $log->warn("$user: unknown error type");
            return delay( $userid, 3 * 60, "unknown" );
        }
    }

    my $feed = $rv->{feed};

    # Eval'd so this failing for some reason doesn't break
    # the feed
    my $final_url = eval { return $res->request->uri; };
    $feed->{final_url} = $final_url->as_string if $final_url;

    $fuzzy_token = DW::FeedCanonicalizer::canonicalize( $synurl, $feed );

    my @items = @{ $rv->{items} };

    # delete existing items older than the age which can show on a
    # friends view.
    my $su   = LJ::load_userid($userid);
    my $udbh = LJ::get_cluster_master($su);
    unless ($udbh) {
        return delay( $userid, 15, "nodb", undef, { backoff => 'hold' } );
    }

    # TAG:LOG2:synsuck_delete_olderitems
    my $secs = ( $LJ::MAX_FRIENDS_VIEW_AGE || 3600 * 24 * 14 ) + 0;    # 2 week default.
    my $sth = $udbh->prepare( "SELECT jitemid, anum FROM log2 WHERE journalid=? AND " .
"logtime < DATE_SUB(NOW(), INTERVAL $secs SECOND)" ); $sth->execute($userid); die $udbh->errstr if $udbh->err; while ( my ( $jitemid, $anum ) = $sth->fetchrow_array ) { if ( LJ::delete_entry( $su, $jitemid, 0, $anum ) ) { $log->debug("$user: deleted itemid=$jitemid anum=$anum"); } else { $log->warn("$user: failed to delete itemid=$jitemid anum=$anum"); } } # determine if link tags are good or not, where good means # "likely to be a unique per item". some feeds have the same # element for each item, which isn't good. # if we have unique ids, we don't compare link tags my ( $compare_links, $have_ids ) = 0; { my %link_seen; foreach my $it (@items) { $have_ids = 1 if $it->{'id'}; next unless $it->{'link'}; $link_seen{ $it->{'link'} } = 1; } $compare_links = 1 if !$have_ids and $feed->{'type'} eq 'rss' and scalar( keys %link_seen ) == scalar(@items); } # if we have unique links/ids, load them for syndicated # items we already have on the server. then, if we have one # already later and see it's changed, we'll do an editevent # instead of a new post. my %existing_item = (); if ( $have_ids || $compare_links ) { my $p = $have_ids ? LJ::get_prop( "log", "syn_id" ) : LJ::get_prop( "log", "syn_link" ); my $sth = $udbh->prepare( "SELECT jitemid, value FROM logprop2 WHERE " . "journalid=? AND propid=? LIMIT 1000" ); $sth->execute( $su->{'userid'}, $p->{'id'} ); while ( my ( $itemid, $id ) = $sth->fetchrow_array ) { $existing_item{$id} = $itemid; } } # post these items my $itemcount = scalar @items; my $newfeed = !$su->timeupdate; # true if never updated before my $newcount = 0; my $errorflag = 0; my $mindate; # "yyyy-mm-dd hh:mm:ss"; my $notedate = sub { my $date = shift; $mindate = $date if !$mindate || $date lt $mindate; }; foreach my $it (@items) { # remove the SvUTF8 flag. 
it's still UTF-8, but # we don't want perl knowing that and messing stuff up # for us behind our back in random places all over # http://zilla.livejournal.org/show_bug.cgi?id=1037 foreach my $attr (qw(id subject text link author)) { next unless exists $it->{$attr} && defined $it->{$attr}; $it->{$attr} = LJ::no_utf8_flag( $it->{$attr} ); } # duplicate entry detection my $dig = LJ::md5_struct($it)->b64digest; my $prevadd = $dbh->selectrow_array( "SELECT MAX(dateadd) FROM synitem WHERE " . "userid=? AND item=?", undef, $userid, $dig ); if ($prevadd) { $notedate->($prevadd); $itemcount--; next; } my $now_dateadd = $dbh->selectrow_array("SELECT NOW()"); die "unexpected format" unless $now_dateadd =~ /^\d\d\d\d\-\d\d\-\d\d \d\d:\d\d:\d\d$/; $dbh->do( "INSERT INTO synitem (userid, item, dateadd) VALUES (?,?,?)", undef, $userid, $dig, $now_dateadd ); $notedate->($now_dateadd); $log->debug("$user: $dig - $it->{'subject'}"); $it->{'text'} =~ s/^\s+//; $it->{'text'} =~ s/\s+$//; my $author = ""; if ( defined $it->{author} ) { $author = "
"; } my $htmllink; if ( defined $it->{'link'} ) { $htmllink = ""; } # Show the