#!/usr/bin/perl
#
# Removes entries that the content importer created in a journal.
#
# Default mode: every imported entry is deleted unless it still has a positive
# reply count after the imported (non-deleted) comments have been subtracted.
#
# --dupes-only: only imported entries that share a (posterid, eventtime,
# subject) key with at least one other entry in the journal are considered.
#
# Nothing is deleted unless the expected codeword is given with --confirm;
# without it the script only reports what it would do. Note that import_source
# props may be rewritten (canonicalized) even in that dry-run mode.

use v5.10;
use strict;
use warnings;

BEGIN { require "$ENV{LJHOME}/cgi-bin/ljlib.pl"; }

use LJ::Entry;
use LJ::Talk;
use DW::Worker::ContentImporter::LiveJournal;
use DW::Worker::ContentImporter::Local::Entries;
use DW::Worker::ContentImporter::Local::Comments;
use Digest::MD5 qw/ md5_hex /;
use Getopt::Long;

my $usage = "Usage: $0 -u USER -c CODEWORD [ --dupes-only ]\n";

my ( $user, $confirm, $dupes );

# Bail out on unparseable options: a mistyped --dupes-only must not silently
# fall back to the (more destructive) default mode.
GetOptions(
    'user=s'     => \$user,
    'confirm=s'  => \$confirm,
    'dupes-only' => \$dupes,
) or die $usage;

my $u = LJ::load_user($user) or die $usage;
$confirm = ( defined $confirm && $confirm eq 'b00p' ) ? 1 : 0;

# Select posts that were imported. %map is keyed by import source and holds
# jitemids; %rmap is the inverse (jitemid => import source).
my %map = %{ DW::Worker::ContentImporter::Local::Entries->get_entry_map($u) || {} };
unless ( scalar keys %map > 0 ) {
    say 'Account has no imported entries, nothing to do.';
    exit 0;
}

my %rmap;
foreach my $source ( keys %map ) {
    $rmap{ $map{$source} } = $source;
}

# Load the imported-comment map and invert it so it is keyed by jtalkid.
my %csrc_in = %{ DW::Worker::ContentImporter::Local::Comments->get_comment_map($u) || {} };
my %csrc;
$csrc{ $csrc_in{$_} } = $_ foreach keys %csrc_in;

# Select out all entries so duplicates can be calculated. The duplicate key
# does not use the entry body because that sometimes differs due to user-tag
# markup changes as the importer was updated over the years.
my $dbcr = LJ::get_cluster_reader($u) or die "Unable to get a cluster reader\n";
my $entries = $dbcr->selectall_hashref(
    q{
        SELECT log2.jitemid, posterid, eventtime, replycount, subject, event, security, allowmask
        FROM log2
            INNER JOIN logtext2
                ON log2.journalid = logtext2.journalid AND log2.jitemid = logtext2.jitemid
        WHERE log2.journalid = ?
    },
    'jitemid', undef, $u->id
);

# A failed query would otherwise look like "no entry has organic comments".
die "Failed to load entries: " . ( $dbcr->errstr // 'unknown error' ) . "\n"
    if !$entries || $dbcr->err;

# Replace subject/body with digests and reduce each entry's replycount by the
# number of imported comments that are not in state D, S or B.
for my $jitemid ( keys %$entries ) {
    my $row = $entries->{$jitemid};
    $row->{subject} = md5_hex( $row->{subject} // '' );
    $row->{event}   = md5_hex( $row->{event}   // '' );

    my %cmts = %{ LJ::Talk::get_talk_data( $u, 'L', $jitemid ) || {} };
    foreach my $jtalkid ( keys %cmts ) {
        next if ( $cmts{$jtalkid}->{state} // '' ) =~ /^[DSB]$/;

        $row->{replycount}-- if exists $csrc{$jtalkid};
        if ( $row->{replycount} < 0 ) {
            die "Invalid accounting! $jitemid replycount went negative!\n";
        }
    }
}

# Returns the duplicate-detection key ("posterid.eventtime.subject") for the
# given jitemid. Dies if the jitemid is not among the loaded entries.
sub entry_key {
    my $jitemid = $_[0] + 0;

    # Must be checked before any access below; touching
    # $entries->{$jitemid}->{...} first would autovivify the key and make this
    # check always pass.
    die 'not found in list' unless exists $entries->{$jitemid};
    my $row = $entries->{$jitemid};

    # Remap content, just in case (this only works for LJ; given that's the vast majority
    # of things, I don't care for now)
    # NOTE(review): the match pattern was lost in the source this was restored
    # from; '<lj' is reconstructed from the remap_lj_user call -- confirm.
    # NOTE(review): subjects have already been replaced by their MD5 hex digest
    # at this point, so this branch looks unreachable -- confirm intent.
    if ( $row->{subject} =~ /<lj/ ) {
        $row->{subject} =
            DW::Worker::ContentImporter::LiveJournal->remap_lj_user(
            { hostname => 'livejournal.com' },
            $row->{subject} );
    }

    return join '.', map { $row->{$_} } qw/ posterid eventtime subject /;
}

# Group every entry in the journal by its duplicate key.
my %dupects;
foreach my $jitemid ( keys %$entries ) {
    my $key = entry_key($jitemid);
    push @{ $dupects{$key} ||= [] }, $jitemid;
}

if ($dupes) {
    say 'Possible duplicates:';
    foreach my $key ( sort keys %dupects ) {
        next unless scalar @{ $dupects{$key} } > 1;
        say " * $key: "
            . join( ', ', map { "$_($entries->{$_}->{replycount})" } @{ $dupects{$key} } );
    }
}

# Iterate each imported entry (newest jitemid first) and see if we can delete it
ENTRY: foreach my $jitemid ( sort { $b <=> $a } keys %rmap ) {

    # Load entry and attempt to canonicalize source if we need to; if we can't
    # canonicalize the source then we must delete this entry
    my $o_entry = LJ::Entry->new( $u, jitemid => $jitemid );

    # Duplicate key of this entry; needed for the bookkeeping after deletion
    # on every path that reaches it.
    my $key;

    if ( canonicalize_source($o_entry) ) {
        $key = entry_key($jitemid);

        # Handle duplicate stuff first
        if ($dupes) {
            next ENTRY
                unless exists $dupects{$key} && scalar @{ $dupects{$key} } > 1;

            # If we get here, this means there are duplicates of this entry --
            # at least two still remain (this one + one more). Possibly more.
            foreach my $dupeid ( @{ $dupects{$key} } ) {
                next if $dupeid == $jitemid;

                my $entry = LJ::Entry->new( $u, jitemid => $dupeid );
                if ( $o_entry->security eq "public" ) {

                    # Named $src_user so it cannot be confused with the
                    # file-level $user that canonicalize_source reads.
                    my ( $domain, $src_user, $ditemid ) = split( m!/!, $rmap{$jitemid} );
                    $_ //= '' for $domain, $src_user, $ditemid;

                    print "$jitemid ( $rmap{$jitemid} ) $dupeid: "
                        . ( $entry->prop('import_source') // '' ) . "\n";
                    print "   * imported: " . $o_entry->url . "\n";
                    print "   * dupe: " . $entry->url . "\n";
                    print "   * original: http://$src_user.$domain/$ditemid.html\n";
                }

                # MAGICAL EDGE CASE: find "imported entries" that aren't marked with
                # an import_source. This happened to one user, one time...
                unless ( $entry->prop('import_source') ) {
                    $entry->set_prop( import_source => $o_entry->prop('import_source') );
                }

                # Another possibility: we got dupes because one entry isn't using the
                # canonical form, so let's try to canonicalize
                canonicalize_source($entry);
            }
        }

        # If any organic (non-imported) comments remain, keep this entry.
        next ENTRY if $entries->{$jitemid}->{replycount} > 0;
    }
    elsif ( exists $entries->{$jitemid} ) {

        # Source could not be canonicalized, so the entry is deleted
        # regardless; still work out its key so the duplicate tracking below
        # stays accurate.
        $key = entry_key($jitemid);
    }

    # If we get here, the entry is destined for the shredder
    print "$rmap{$jitemid} (jitemid $jitemid) scheduled for deletion";
    if ($confirm) {
        my $rv = LJ::delete_entry( $u, $jitemid, 0, undef );
        say $rv ? ' ... deleted' : ' ... FAILED TO DELETE';
    }
    else {
        say ' ... confirmation not set';
    }

    # Paperwork: since we deleted this entry we need to remove it from the
    # duplicates tracking, as it can no longer 'cause' duplicates
    if ( defined $key && exists $dupects{$key} ) {
        $dupects{$key} = [ grep { $_ != $jitemid } @{ $dupects{$key} } ];
    }
}

# Delete comments with a nodeid of 0. This should never happen, but has been
# known to happen to some imported comments.
if ($confirm) {
    LJ::delete_all_comments( $u, 'L', 0 );
}

exit 0;

# Ensures the entry's import_source prop is in the canonical
# "host/username/ditemid" form, rewriting the prop when a known legacy format
# is recognised. Where the legacy format carries no username, the --user value
# from the command line is used.
#
# Takes the entry object; returns 1 if the source is (now) canonical, 0 if the
# format is not recognised.
sub canonicalize_source {
    my $entry  = $_[0];
    my $source = $entry->prop('import_source') // '';

    return 1 if $source =~ m!^(?:livejournal|insanejournal)\.com/[a-z0-9_]+/\d+$!;

    # The dupe fixer once had a bug that could leave the username empty.
    if ( $source =~ m!^livejournal\.com//(\d+)$! ) {
        my $new_source = "livejournal.com/$user/$1";
        say "Fixing import_source: $source => $new_source";
        $entry->set_prop( import_source => $new_source );
        return 1;
    }

    # http://runpunkrun.livejournal.com/334.html
    if ( $source =~ m!http://([a-z0-9_-]*)\.((?:livejournal|insanejournal)\.com)/(\d+)\.html$! ) {
        my ( $host, $loc_user, $ditemid ) = ( $2, $1, $3 );
        $loc_user =~ s/-/_/g;

        # If the source didn't have a user, this was one class of issues where we
        # had an undefined variable ages ago. Add it.
        $loc_user ||= $user;

        my $new_source = "$host/$loc_user/$ditemid";
        say "Fixing import_source: $source => $new_source";
        $entry->set_prop( import_source => $new_source );
        return 1;
    }

    # There was, at one point, a bad bug that showed up with the following source,
    # since there are 90 LJ imports for every IJ import (or more) let's just assume
    # that it's LJ... and the same username
    if ( $source =~ m!^#/(\d+)\.html$! ) {
        my $ditemid    = $1;
        my $new_source = "livejournal.com/$user/$ditemid";
        say "Fixing import_source: $source => $new_source";
        $entry->set_prop( import_source => $new_source );
        return 1;
    }

    say "UNKNOWN SOURCE FORMAT: $source";
    return 0;
}