# This code was forked from the LiveJournal project owned and operated # by Live Journal, Inc. The code has been modified and expanded by # Dreamwidth Studios, LLC. These files were originally licensed under # the terms of the license supplied by Live Journal, Inc, which can # currently be found at: # # http://code.livejournal.org/trac/livejournal/browser/trunk/LICENSE-LiveJournal.txt # # In accordance with the original license, this code and all its # modifications are provided under the GNU General Public License. # A copy of that license can be found in the LICENSE file included as # part of this distribution. package LJ::SynSuck; use strict; use HTTP::Status; use Log::Log4perl; my $log = Log::Log4perl->get_logger(__PACKAGE__); use LJ::Utils qw(md5_struct); use LJ::Protocol; use LJ::ParseFeed; use LJ::CleanHTML; use DW::FeedCanonicalizer; sub update_feed { my ($urow) = @_; return unless $urow; my ( $user, $userid, $synurl, $lastmod, $etag, $readers ) = map { $urow->{$_} } qw(user userid synurl lastmod etag numreaders); # we can't deal with non-visible journals. try again in a couple # hours. maybe they were unsuspended or whatever. my $su = LJ::load_userid($userid); return delay( $userid, 120, "non_statusvis_v" ) unless $su->is_visible; # we're a child process now, need to invalidate caches and # get a new database handle LJ::start_request(); my $resp = get_content($urow) or return 0; return process_content( $urow, $resp ); } sub _backoff_multiplier { my ($failcount) = @_; return 2**( $failcount > 7 ? 7 : $failcount ); } sub delay { my ( $userid, $minutes, $status, $synurl, $opts ) = @_; $opts //= {}; my $backoff = $opts->{backoff} // 'escalate'; my $token = defined $synurl ? DW::FeedCanonicalizer::canonicalize($synurl) : undef; my $dbh = LJ::get_db_writer(); my $failcount = $dbh->selectrow_array( "SELECT failcount FROM syndicated WHERE userid=?", undef, $userid ) || 0; if ( $backoff eq 'reset' ) { $failcount = 0; } elsif ( $backoff eq 'escalate' ) { $failcount++; } # 'hold' leaves failcount unchanged # apply exponential backoff on escalate/hold (if failcount > 0) if ($failcount) { $minutes = $minutes * _backoff_multiplier($failcount); # cap at 30 days my $max_minutes = 30 * 24 * 60; $minutes = $max_minutes if $minutes > $max_minutes; } # add some random backoff to avoid waves building up $minutes += int( rand(5) ); $log->info( "userid=$userid: status=$status backoff=$backoff failcount=$failcount delay=${minutes}m"); $dbh->do( "UPDATE syndicated SET lastcheck=NOW(), checknext=DATE_ADD(NOW(), " . "INTERVAL ? MINUTE), laststatus=?, failcount=?, " . "fuzzy_token = COALESCE(?,fuzzy_token) WHERE userid=?", undef, $minutes, $status, $failcount, $token, $userid ); return undef; } sub max_size { my ($u) = @_; # optional user object for feed my $max_size = $LJ::SYNSUCK_MAX_SIZE || 3000; # in kb if ( $u && $u->has_priv( "siteadmin", "largefeedsize" ) ) { $max_size = $LJ::SYNSUCK_LARGE_MAX_SIZE || 6000; # in kb } return 1024 * $max_size; # in bytes } sub get_content { my ($urow) = @_; my ( $user, $userid, $synurl, $lastmod, $etag, $readers ) = map { $urow->{$_} } qw(user userid synurl lastmod etag numreaders); my $dbh = LJ::get_db_writer(); # see if things have changed since we last looked and acquired the lock. # otherwise we could 1) check work, 2) get lock, and between 1 and 2 another # process could do both steps. we don't want to duplicate work already done. my $now_checknext = $dbh->selectrow_array( "SELECT checknext FROM syndicated " . "WHERE userid=?", undef, $userid ); return if $now_checknext ne $urow->{checknext}; my $ua = LJ::get_useragent( role => 'syn_sucker' ); my $reader_info = $readers ? "; $readers readers" : ""; $ua->agent( "$LJ::SITENAME ($LJ::ADMIN_EMAIL; for $LJ::SITEROOT/users/$user/" . $reader_info . ")" ); $log->info("Synsuck: $user ($synurl)"); my $req = HTTP::Request->new( "GET", $synurl ); my $can_accept = HTTP::Message::decodable; $req->header( 'Accept-Encoding', $can_accept ); $req->header( 'If-Modified-Since', LJ::time_to_http($lastmod) ) if $lastmod; $req->header( 'If-None-Match', $etag ) if $etag; my ( $content, $too_big ); my $syn_u = LJ::load_user($user); my $max_size = max_size($syn_u); my $res = eval { $ua->request( $req, sub { if ( length($content) > $max_size ) { $too_big = 1; return; } $content .= $_[0]; }, 4096 ); }; if ($@) { return delay( $userid, 120, "lwp_death" ); } if ($too_big) { return delay( $userid, 60, "toobig" ); } # Since we are treating content specially above, we have to recreate # the HTTP::Message with it to get the decoded content. my $message = HTTP::Message->new( $res->headers, $content ); $content = $message->decoded_content( charset => 'none' ); if ( $res->is_error() ) { # http error $log->warn( "HTTP error for $user: " . $res->status_line() ); # overload parseerror here because it's already there -- we'll # never have both an http error and a parse error on the # same request $syn_u->set_prop( "rssparseerror", $res->status_line() ) if $syn_u; delay( $userid, 3 * 60, "parseerror" ); return; } # check if not modified if ( $res->code() == RC_NOT_MODIFIED ) { $log->debug("$user: not modified"); return delay( $userid, $readers ? 60 : 24 * 60, "notmodified", $synurl, { backoff => 'reset' } ); } return [ $res, $content ]; } # helper function which takes feed XML # and returns a list of $num items from the feed # in proper order sub parse_items_from_feed { my ( $content, $num ) = @_; $num ||= 20; return ( 0, { type => "noitems" } ) unless defined $content; # WARNING: blatant XML spec violation ahead... # # Blogger doesn't produce valid XML, since they don't handle encodings # correctly. So if we see they have no encoding (which is UTF-8 implictly) # but it's not valid UTF-8, say it's Windows-1252, which won't # cause XML::Parser to barf... but there will probably be some bogus characters. # better than nothing I guess. (personally, I'd prefer to leave it broken # and have people bitch at Blogger, but jwz wouldn't stop bugging me) # XML::Parser doesn't include Windows-1252, but we put it in cgi-bin/XML/* for it # to find. my $encoding; if ( $content =~ /(<\?xml.+?>)/ && $1 =~ /encoding=([\"\'])(.+?)\1/ ) { $encoding = lc($2); } if ( !$encoding && !LJ::is_utf8($content) ) { $content =~ s/\?>/ encoding='windows-1252' \?>/; } # WARNING: another hack... # People produce what they think is iso-8859-1, but they include # Windows-style smart quotes. Check for invalid iso-8859-1 and correct. if ( $encoding =~ /^iso-8859-1$/i && $content =~ /[\x80-\x9F]/ ) { # They claimed they were iso-8859-1, but they are lying. # Assume it was Windows-1252. $log->debug("Invalid ISO-8859-1; assuming Windows-1252"); $content =~ s/encoding=([\"\'])(.+?)\1/encoding='windows-1252'/; } # ANOTHER hack: if a feed asks for ANSI_v3.4-1968 (ASCII), alias it to us-ascii if ( $encoding =~ /^ANSI_X3.4-1968$/i ) { $content =~ s/encoding=([\"\'])(.+?)\1/encoding='us-ascii'/; } # and yet another hack, this time to alias 'ascii' to 'us-ascii' if ( $encoding =~ /^ascii$/i ) { $content =~ s/encoding=([\"\'])(.+?)\1/encoding='us-ascii'/; } # parsing time... my ( $feed, $error ) = LJ::ParseFeed::parse_feed($content); return ( 0, { type => "parseerror", message => $error } ) if $error; # another sanity check return ( 0, { type => "noitems" } ) unless ref $feed->{items} eq "ARRAY"; my @items = reverse @{ $feed->{items} } or return ( 0, { type => "noitems" } ); # If the feed appears to be datestamped, resort chronologically, # from earliest to latest - oldest entries are posted first, below. my $timesort = sub { LJ::mysqldate_to_time( $_[0]->{time} ) }; @items = sort { $timesort->($a) <=> $timesort->($b) } @items if $items[0]->{time}; # take most recent 20 splice( @items, 0, @items - $num ) if @items > $num; return ( 1, { items => \@items, feed => $feed } ); } sub process_content { my ( $urow, $resp ) = @_; my ( $res, $content ) = @$resp; my ( $user, $userid, $synurl, $lastmod, $etag, $readers, $fuzzy_token ) = map { $urow->{$_} } qw(user userid synurl lastmod etag numreaders fuzzy_token); my $dbh = LJ::get_db_writer(); my ( $ok, $rv ) = parse_items_from_feed( $content, 20 ); unless ($ok) { if ( $rv->{type} eq "parseerror" ) { # parse error! if ( my $error = $rv->{message} ) { $log->warn("$user: parse error: $error"); $error =~ s! at /.*!!; $error =~ s/^\n//; # cleanup of newline at the beginning of the line my $syn_u = LJ::load_user($user); $syn_u->set_prop( "rssparseerror", $error ) if $syn_u; } delay( $userid, 3 * 60, "parseerror", $synurl ); return; } elsif ( $rv->{type} eq "noitems" ) { return delay( $userid, 3 * 60, "noitems", $synurl ); } else { $log->warn("$user: unknown error type"); return delay( $userid, 3 * 60, "unknown" ); } } my $feed = $rv->{feed}; # Eval'd so this failing for some reason doesn't break # the feed my $final_url = eval { return $res->request->uri; }; $feed->{final_url} = $final_url->as_string if $final_url; $fuzzy_token = DW::FeedCanonicalizer::canonicalize( $synurl, $feed ); my @items = @{ $rv->{items} }; # delete existing items older than the age which can show on a # friends view. my $su = LJ::load_userid($userid); my $udbh = LJ::get_cluster_master($su); unless ($udbh) { return delay( $userid, 15, "nodb", undef, { backoff => 'hold' } ); } # TAG:LOG2:synsuck_delete_olderitems my $secs = ( $LJ::MAX_FRIENDS_VIEW_AGE || 3600 * 24 * 14 ) + 0; # 2 week default. my $sth = $udbh->prepare( "SELECT jitemid, anum FROM log2 WHERE journalid=? AND " . "logtime < DATE_SUB(NOW(), INTERVAL $secs SECOND)" ); $sth->execute($userid); die $udbh->errstr if $udbh->err; while ( my ( $jitemid, $anum ) = $sth->fetchrow_array ) { if ( LJ::delete_entry( $su, $jitemid, 0, $anum ) ) { $log->debug("$user: deleted itemid=$jitemid anum=$anum"); } else { $log->warn("$user: failed to delete itemid=$jitemid anum=$anum"); } } # determine if link tags are good or not, where good means # "likely to be a unique per item". some feeds have the same # element for each item, which isn't good. # if we have unique ids, we don't compare link tags my ( $compare_links, $have_ids ) = 0; { my %link_seen; foreach my $it (@items) { $have_ids = 1 if $it->{'id'}; next unless $it->{'link'}; $link_seen{ $it->{'link'} } = 1; } $compare_links = 1 if !$have_ids and $feed->{'type'} eq 'rss' and scalar( keys %link_seen ) == scalar(@items); } # if we have unique links/ids, load them for syndicated # items we already have on the server. then, if we have one # already later and see it's changed, we'll do an editevent # instead of a new post. my %existing_item = (); if ( $have_ids || $compare_links ) { my $p = $have_ids ? LJ::get_prop( "log", "syn_id" ) : LJ::get_prop( "log", "syn_link" ); my $sth = $udbh->prepare( "SELECT jitemid, value FROM logprop2 WHERE " . "journalid=? AND propid=? LIMIT 1000" ); $sth->execute( $su->{'userid'}, $p->{'id'} ); while ( my ( $itemid, $id ) = $sth->fetchrow_array ) { $existing_item{$id} = $itemid; } } # post these items my $itemcount = scalar @items; my $newfeed = !$su->timeupdate; # true if never updated before my $newcount = 0; my $errorflag = 0; my $mindate; # "yyyy-mm-dd hh:mm:ss"; my $notedate = sub { my $date = shift; $mindate = $date if !$mindate || $date lt $mindate; }; foreach my $it (@items) { # remove the SvUTF8 flag. it's still UTF-8, but # we don't want perl knowing that and messing stuff up # for us behind our back in random places all over # http://zilla.livejournal.org/show_bug.cgi?id=1037 foreach my $attr (qw(id subject text link author)) { next unless exists $it->{$attr} && defined $it->{$attr}; $it->{$attr} = LJ::no_utf8_flag( $it->{$attr} ); } # duplicate entry detection my $dig = LJ::md5_struct($it)->b64digest; my $prevadd = $dbh->selectrow_array( "SELECT MAX(dateadd) FROM synitem WHERE " . "userid=? AND item=?", undef, $userid, $dig ); if ($prevadd) { $notedate->($prevadd); $itemcount--; next; } my $now_dateadd = $dbh->selectrow_array("SELECT NOW()"); die "unexpected format" unless $now_dateadd =~ /^\d\d\d\d\-\d\d\-\d\d \d\d:\d\d:\d\d$/; $dbh->do( "INSERT INTO synitem (userid, item, dateadd) VALUES (?,?,?)", undef, $userid, $dig, $now_dateadd ); $notedate->($now_dateadd); $log->debug("$user: $dig - $it->{'subject'}"); $it->{'text'} =~ s/^\s+//; $it->{'text'} =~ s/\s+$//; my $author = ""; if ( defined $it->{author} ) { $author = "

Posted by " . LJ::ehtml( $it->{author} ) . "

"; } my $htmllink; if ( defined $it->{'link'} ) { $htmllink = "

" . "{'link'}\">$it->{'link'}

"; } # Show the link if it's present and different than the # . # [zilla: 267] Patch: Chaz Meyers if ( defined $it->{'id'} && $it->{'id'} ne $it->{'link'} && $it->{'id'} =~ m!^https?://! ) { $htmllink .= "

" . "{'id'}\">$it->{'id'}

"; } # rewrite relative URLs to absolute URLs, but only invoke the HTML parser # if we see there's some image or link tag, to save us some work if it's # unnecessary (the common case) if ( $it->{'text'} =~ /<(?:img|a)\b/i ) { # TODO: support XML Base? http://www.w3.org/TR/xmlbase/ my $base_href = $it->{'link'} || $synurl; LJ::CleanHTML::resolve_relative_urls( \$it->{'text'}, $base_href ); } # $own_time==1 means we took the time from the feed rather than localtime my ( $own_time, $year, $mon, $day, $hour, $min ); if ( $it->{'time'} && $it->{'time'} =~ m!^(\d\d\d\d)-(\d\d)-(\d\d) (\d\d):(\d\d)! ) { $own_time = 1; ( $year, $mon, $day, $hour, $min ) = ( $1, $2, $3, $4, $5 ); } else { $own_time = 0; my @now = localtime(); ( $year, $mon, $day, $hour, $min ) = ( $now[5] + 1900, $now[4] + 1, $now[3], $now[2], $now[1] ); } # just bail on entries older than two weeks instead of reposting them if ($own_time) { my $age = time() - LJ::mysqldate_to_time( $it->{'time'} ); if ( $age > $secs ) { # $secs is defined waaaaaaaay above $itemcount--; next; } } $newcount++; # we're committed to posting this item now my $command = "postevent"; my $req = { 'username' => $user, 'ver' => 1, 'subject' => $it->{'subject'}, 'event' => "$author$htmllink$it->{'text'}$htmllink", 'year' => $year, 'mon' => $mon, 'day' => $day, 'hour' => $hour, 'min' => $min, 'props' => { 'syn_link' => $it->{'link'}, }, }; $req->{'props'}->{'syn_id'} = $it->{'id'} if $it->{'id'}; my $flags = { 'nopassword' => 1, 'allow_truncated_subject' => 1, }; # if the post contains html linebreaks, assume it's preformatted. if ( $it->{'text'} =~ /<(?:p|br)\b/i ) { $req->{'props'}->{'opt_preformatted'} = 1; } # If this is a new feed, backdate all but last three items. # Note this is a best effort; might not print all three entries # if duplicate entries are detected later in the feed. $req->{props}->{opt_backdated} = 1 if $newfeed && ( $itemcount - $newcount ) >= 3; # do an editevent if we've seen this item before my $id = $have_ids ? $it->{'id'} : $it->{'link'}; my $old_itemid = $existing_item{$id}; if ( $id && $old_itemid ) { $newcount--; # cancel increment above $command = "editevent"; $req->{'itemid'} = $old_itemid; # the editevent requires us to resend the date info, which # we have to go fetch first, in case the feed doesn't have it # TAG:LOG2:synsuck_fetch_itemdates unless ($own_time) { my $origtime = $udbh->selectrow_array( "SELECT eventtime FROM log2 WHERE " . "journalid=? AND jitemid=?", undef, $su->{'userid'}, $old_itemid ); $origtime =~ /(\d\d\d\d)-(\d\d)-(\d\d) (\d\d):(\d\d)/; $req->{'year'} = $1; $req->{'mon'} = $2; $req->{'day'} = $3; $req->{'hour'} = $4; $req->{'min'} = $5; } } my $err; my $pres = LJ::Protocol::do_request( $command, $req, \$err, $flags ); unless ( $pres && !$err ) { $log->error("$user: $err"); $errorflag = 1; } } # delete some unneeded synitems. the limit 1000 is because # historically we never deleted and there are accounts with # 222,000 items on a myisam table, and that'd be quite the # delete hit. # the 14 day interval is because if a remote site deleted an # entry, it's possible for the oldest item that was previously # gone to reappear, and we want to protect against that a # little. if ($mindate) { $dbh->do( "DELETE FROM synitem WHERE userid=? AND " . "dateadd < ? - INTERVAL 14 DAY LIMIT 1000", undef, $userid, $mindate ); } $dbh->do( "UPDATE syndicated SET oldest_ourdate=? WHERE userid=?", undef, $mindate, $userid ); # bail out if errors, and try again shortly if ($errorflag) { delay( $userid, 30, "posterror", undef, { backoff => 'hold' } ); return; } # update syndicated account's profile if necessary $su->preload_props( "url", "urlname" ); { my $title = $feed->{'title'}; $title = $su->{'user'} unless LJ::is_utf8($title); if ( defined $title && $title ne $su->{'name'} ) { $title =~ s/[\n\r]//g; $su->update_self( { name => $title } ); $su->set_prop( "urlname", $title ); } my $link = $feed->{'link'}; if ( $link && $link ne $su->{'url'} ) { $su->set_prop( "url", $link ); } my $bio = $su->bio; $su->set_bio( $feed->{'description'} ) unless $bio && $bio =~ /\[LJ:KEEP\]/; } my $r_lastmod = LJ::http_to_time( $res->header('Last-Modified') ); my $r_etag = $res->header('ETag'); # decide when to poll next (in minutes). # FIXME: this is super bad. (use hints in RSS file!) my $int = $newcount ? 30 : 60; my $status = $newcount ? "ok" : "nonew"; my $updatenew = $newcount ? ", lastnew=NOW()" : ""; # update reader count while we're changing things, but not # if feed is stale (minimize DB work for inactive things) if ( $newcount || !defined $readers ) { $readers = $su->watched_by_userids; } # if readers are gone, don't check for a whole day $int = 60 * 24 unless $readers; $log->info("userid=$userid: status=$status failcount=0 (reset) delay=${int}m"); $dbh->do( "UPDATE syndicated SET fuzzy_token=?, checknext=DATE_ADD(NOW(), INTERVAL ? MINUTE), " . "lastcheck=NOW(), lastmod=?, etag=?, laststatus=?, numreaders=?, failcount=0 $updatenew " . "WHERE userid=?", undef, $fuzzy_token, $int, $r_lastmod, $r_etag, $status, $readers, $userid ) or die $dbh->errstr; return 1; } 1;