#!/usr/bin/perl # # DW::Worker::ContentImporter::LiveJournal::Entries # # Importer worker for LiveJournal-based sites entries. # # Authors: # Andrea Nall # Mark Smith # # Copyright (c) 2009-2011 by Dreamwidth Studios, LLC. # # This program is free software; you may redistribute it and/or modify it under # the same terms as Perl itself. For a copy of the license, please reference # 'perldoc perlartistic' or 'perldoc perlgpl'. # package DW::Worker::ContentImporter::LiveJournal::Entries; use strict; use base 'DW::Worker::ContentImporter::LiveJournal'; use Carp qw/ croak confess /; use Time::HiRes qw/ tv_interval gettimeofday /; use DW::Task::SphinxCopier; use DW::Worker::ContentImporter::Local::Entries; sub work { # VITALLY IMPORTANT THAT THIS IS CLEARED BETWEEN JOBS %DW::Worker::ContentImporter::LiveJournal::MAPS = (); LJ::start_request(); my ( $class, $job ) = @_; my $opts = $job->arg; my $data = $class->import_data( $opts->{userid}, $opts->{import_data_id} ); return $class->decline($job) unless $class->enabled($data); eval { try_work( $class, $job, $opts, $data ); }; if ( my $msg = $@ ) { $msg =~ s/\r?\n/ /gs; return $class->temp_fail( $data, 'lj_entries', $job, 'Failure running job: %s', $msg ); } # FIXME: temporary hack to reclaim memory when we have imported entries exit 0; } sub try_work { my ( $class, $job, $opts, $data ) = @_; my $begin_time = [ gettimeofday() ]; # we know that we can potentially take a while, so budget some hours for # the import job before someone else comes in to snag it $job->grabbed_until( time() + 3600 * 12 ); $job->save; # failure wrappers for convenience my $fail = sub { return $class->fail( $data, 'lj_entries', $job, @_ ); }; my $ok = sub { return $class->ok( $data, 'lj_entries', $job ); }; my $temp_fail = sub { return $class->temp_fail( $data, 'lj_entries', $job, @_ ); }; my $status = sub { return $class->status( $data, 'lj_entries', {@_} ); }; # logging sub my ( $logfile, $last_log_time ); $logfile = $class->start_log( "lj_entries", userid => $opts->{userid}, import_data_id => $opts->{import_data_id} ) or return $temp_fail->('Internal server error creating log.'); my $log = sub { $last_log_time ||= [ gettimeofday() ]; my $fmt = "[%0.4fs %0.1fs] " . shift() . "\n"; my $msg = sprintf( $fmt, tv_interval($last_log_time), tv_interval($begin_time), @_ ); print $logfile $msg; $job->debug($msg); $last_log_time = [ gettimeofday() ]; }; # setup my $u = LJ::load_userid( $data->{userid} ) or return $fail->( 'Unable to load target with id %d.', $data->{userid} ); $log->( 'Import begun for %s(%d).', $u->user, $u->userid ); $log->( 'memory usage is now %dMB', LJ::gtop()->proc_mem($$)->resident / 1024 / 1024 ); # title munging my $title = sub { my $msg = sprintf( shift(), @_ ); $msg = " $msg" if $msg; $0 = sprintf( 'content-importer [entries: %s(%d)%s]', $u->user, $u->id, $msg ); }; $title->(); # load entry map my $entry_map = DW::Worker::ContentImporter::Local::Entries->get_entry_map($u) || {}; $log->( 'Loaded entry map with %d entries.', scalar( keys %$entry_map ) ); $log->( 'memory usage is now %dMB', LJ::gtop()->proc_mem($$)->resident / 1024 / 1024 ); # and xpost map my $xpost_map = $class->get_xpost_map( $u, $data ) || {}; $log->( 'Loaded xpost map with %d entries.', scalar( keys %$xpost_map ) ); $log->( 'memory usage is now %dMB', LJ::gtop()->proc_mem($$)->resident / 1024 / 1024 ); # get the itemid of the most recent entry (just so we know how many entries there have # been in the life of this account) $log->('Fetching the most recent entry.'); my $last = $class->call_xmlrpc( $data, 'getevents', { ver => 1, selecttype => 'lastn', howmany => 50, lineendings => 'unix', } ); my $xmlrpc_fail = 'XMLRPC failure: ' . ( $last ? $last->{faultString} : '[unknown]' ); $xmlrpc_fail .= " (community: $data->{usejournal})" if $data->{usejournal}; return $temp_fail->($xmlrpc_fail) if !$last || $last->{fault}; # we weren't able to get any data. Maybe weren't able to connect to remote server? # we want to try again return $temp_fail->('Failed to fetch the most recent entry.') unless ref $last->{events} eq 'ARRAY'; # look for the latest non-DOOO entry # first entry may be a sticky entry (can't tell via props) # so we guess by taking the max non-DOOO entry on this list my $maxid; foreach my $event ( @{ $last->{events} } ) { if ( $event->{props}->{opt_backdated} ) { $log->( 'Skipping %d -- backdated', $event->{itemid} ); } else { $maxid = $event->{itemid} > $maxid ? $event->{itemid} : $maxid; } } $log->( 'Discovered that the maximum jitemid on the remote is %d.', $maxid ) if $maxid; # we got entries but weren't able to find a single *non*-DOOO entry. # No idea what the actual latest entry is. Give up now, tell the user to fix it unless ($maxid) { $log->( 'Got %d entries but all were DOOO.', scalar @{ $last->{events} } ); return $fail->( "We weren't able to find your latest entry because you have several entries that are dated out of order (DOOO, or backdated). " . "Please edit all the entries on %s that are DOOO and set their date to one that's earlier than your latest non-DOOO entry. " . "Then try your import again.", $data->{hostname} ); } my ( $duplicates_map, %has ) = $class->find_prunable_items( $u, $entry_map, $xpost_map, hostname => $data->{hostname}, username => $data->{username}, log_sub => $log, force_reimport => $data->{options}->{lj_entries_remap_icon} ); $title->('post-prune'); # this is a useful helper sub we use my $count = 0; my $process_entry = sub { my $evt = $_[0]; my $user = $data->{usejournal} // $data->{username}; # Key for entry_map. We know the username and the site, so we set this to # something that is dependable. $evt->{key} = $data->{hostname} . '/' . $user . '/' . ( $evt->{itemid} * 256 + $evt->{anum} ); # We also need to calculate the remote URL for ESN purposes. my $ext_u = DW::External::User->new( user => $user, site => $data->{hostname} ); # Although I don't think this is likely to ever croak, # I'm being extra-cautious and using an eval here to # make sure this can't interrupt the import process. $evt->{url} = eval { $ext_u->site->entry_url( $ext_u, %$evt ) }; $count++; $log->( ' %d %s %s; mapped = %d (import_source) || %d (xpost).', $evt->{itemid}, $evt->{url}, $evt->{logtime}, $entry_map->{ $evt->{key} }, $xpost_map->{ $evt->{itemid} } ); # always set the picture_keyword property though, in case they're a paid # user come back to fix their keywords. this forcefully overwrites their # local picture keyword if ( my $jitemid = $entry_map->{ $evt->{key} } ) { my $entry = LJ::Entry->new( $u, jitemid => $jitemid ); my $kw = $evt->{props}->{picture_keyword}; if ( $u->userpic_have_mapid ) { $entry->set_prop( picture_mapid => $u->get_mapid_from_keyword( $kw, create => 1 ) ); } else { $entry->set_prop( picture_keyword => $kw ); } } # now try to skip it if we already have it return if $entry_map->{ $evt->{key} } || $xpost_map->{ $evt->{itemid} } || $has{ $evt->{itemid} }; # try to detect suspiciously similar entries # but first clean subject to make sure it matches what we actually inserted into the db # separate condition from above so that we can log that we guessed these are similar $evt->{subject} = $class->remap_lj_user( $data, $evt->{subject} || "" ); my $dupecheck_key = $evt->{eventtime} . '-' . $evt->{subject}; if ( $duplicates_map->{$dupecheck_key} ) { $log->( "Skipping entry on %s. Remote entry jitemid %d similar to local jitemid %d.", $evt->{eventtime}, $evt->{itemid}, $duplicates_map->{$dupecheck_key} ); $has{ $evt->{itemid} } = 1; return; } # clean up event for LJ and remap friend groups my @item_errors; my $allowmask = $evt->{allowmask}; my $newmask = $class->remap_groupmask( $data, $allowmask ); # if we are unable to determine a good groupmask, then fall back to making # the entry private and mark the error. if ( $allowmask != 1 && $newmask == 1 ) { $newmask = 0; push @item_errors, "Could not determine groups to post to."; } $evt->{allowmask} = $newmask; # now try to determine if we need to post this as a user my $posteru; if ( $data->{usejournal} ) { my ( $posterid, $fid ) = $class->get_remapped_userids( $data, $evt->{poster}, $log ); unless ($posterid) { # FIXME: need a better way of totally dying... push @item_errors, "Unable to map poster from LJ user '$evt->{poster}' to local user."; $status->( remote_url => $evt->{url}, errors => \@item_errors, ); return; } $posteru = LJ::load_userid($posterid); } # we just link polls to the original site # FIXME: this URL should be from some method and not manually constructed my $event = $evt->{event}; $event =~ s!<.+?-poll-(\d+?)>![Poll #$1]!g; if ( $event =~ m/<.+?-embed-.+?>/ ) { $event =~ s/<.+?-embed-.+?>//g; push @item_errors, "Entry contained an embed tag, please manually re-add the embedded content."; } if ( $event =~ m/<.+?-template-.+?>/ ) { $event =~ s/<.+?-template-.+?>//g; push @item_errors, "Entry contained a template tag, please manually re-add the templated content."; } $evt->{event} = $class->remap_lj_user( $data, $event ); # actually post it my ( $ok, $res ) = DW::Worker::ContentImporter::Local::Entries->post_event( $data, $entry_map, $u, $posteru, $evt, \@item_errors ); # we don't need this text anymore, so nuke it to try to save memory delete $evt->{event}; delete $evt->{subject}; # now record any errors that happened $status->( remote_url => $evt->{url}, post_res => $res, errors => \@item_errors, ) if @item_errors; }; # helper to load some events -- if this function does not return a true value, # the caller must exit (errors were encountered). my $fetch_events = sub { # let them know we're still working $job->grabbed_until( time() + 3600 ); $job->save; # clear "request" cache of db handles to force revalidation, whenever fetching a block # of entries, to avoid possibly using one that was idle too long without testing it. $LJ::DBIRole->clear_req_cache(); $log->( 'Fetching %d items.', scalar @_ ); $title->( 'getevents - %d to %d', $_[0], $_[-1] ); # try to get it from the remote server my $hash = $class->call_xmlrpc( $data, 'getevents', { ver => 1, itemids => join( ',', @_ ) . ",", selecttype => 'multiple', lineendings => 'unix', } ); # if we get an error, then we have to abort the import my $xmlrpc_fail = 'XMLRPC failure: ' . ( $hash ? $hash->{faultString} : '[unknown]' ); $xmlrpc_fail .= " (community: $data->{usejournal})" if $data->{usejournal}; if ( !$hash || $hash->{fault} ) { $temp_fail->($xmlrpc_fail); return 0; } # good, import this event $process_entry->($_) foreach @{ $hash->{events} || [] }; return 1; }; # now get the actual events my @toload; foreach my $jid ( 0 .. $maxid ) { push @toload, $jid unless exists $has{$jid} && $has{$jid}; if ( scalar @toload == 100 ) { return unless $fetch_events->(@toload); @toload = (); } } if ( scalar @toload > 0 ) { return unless $fetch_events->(@toload); } # mark the comments mode as ready to schedule my $dbh = LJ::get_db_writer(); $dbh->do( q{UPDATE import_items SET status = 'ready' WHERE userid = ? AND item IN ('lj_comments') AND import_data_id = ? AND status = 'init'}, undef, $u->id, $opts->{import_data_id} ); # Kick off a indexing job for this user if (@LJ::SPHINX_SEARCHD) { DW::TaskQueue->dispatch( DW::Task::SphinxCopier->new( { userid => $u->id, source => "importen" } ) ); } return $ok->(); } sub find_prunable_items { my ( $class, $u, $entry_map, $xpost_map, %opts ) = @_; my $log = $opts{log_sub} || sub { warn @_ }; my $force_reimport = $opts{force_reimport}; my $hostname = $opts{hostname}; my $username = $opts{username}; # this is an optimization. since we never do an edit event (only post!) we will # never get changes anyway. so let's remove from the list of things to sync any # post that we already know about. (not that we really care, but it's much nicer # on people we're pulling from.) my %has; # jitemids we have foreach my $url ( keys %$entry_map ) { # but first, let's skip anything that isn't from the server we are importing # from. this assumes URLs never have other hostnames, so if someone were to # register testlivejournal.com and do an import, they will have trouble # importing. if they want to do that to befunge this logic, more power to them. $url =~ s/-/_/g; # makes \b work below next unless $url =~ /\Q$hostname\E/ && $url =~ /\b$username\b/; unless ( $url =~ m!/(\d+)(?:\.html)?$! ) { $log->( 'URL %s not of expected format in prune.', $url ); next; } # if we want a paid user or someone to get their picture keyword updated, # skip them here. next if $force_reimport; # yes, we can has this jitemid from the remote side $has{ $1 >> 8 } = 1; } $log->( 'Identified %d items we already know about (first pass).', scalar( keys %has ) ); $log->( 'memory usage is now %dMB', LJ::gtop()->proc_mem($$)->resident / 1024 / 1024 ); # this is another optimization. we know crossposted entries can be removed from # the list of things we will import, as we generated them to begin with. foreach my $itemid ( keys %$xpost_map ) { $has{$itemid} = 1; } $log->( 'Identified %d items we already know about (second pass).', scalar( keys %has ) ); $log->( 'memory usage is now %dMB', LJ::gtop()->proc_mem($$)->resident / 1024 / 1024 ); my $duplicates_map = DW::Worker::ContentImporter::Local::Entries->get_duplicates_map( $u, \%has ) || {}; $log->( 'Found %d items to check for duplicates.', scalar( keys %$duplicates_map ) ); $log->( 'memory usage is now %dMB', LJ::gtop()->proc_mem($$)->resident / 1024 / 1024 ); return ( $duplicates_map, %has ); } 1;