222 lines
7.8 KiB
Perl
Executable file
222 lines
7.8 KiB
Perl
Executable file
#!/usr/bin/perl
|
|
|
|
use v5.10;
|
|
use strict;
|
|
BEGIN { require "$ENV{LJHOME}/cgi-bin/ljlib.pl"; }
|
|
|
|
use LJ::Entry;
|
|
use LJ::Talk;
|
|
use DW::Worker::ContentImporter::LiveJournal;
|
|
use DW::Worker::ContentImporter::Local::Entries;
|
|
use DW::Worker::ContentImporter::Local::Comments;
|
|
|
|
use Digest::MD5 qw/ md5_hex /;
|
|
use Getopt::Long;
|
|
|
|
# Command-line options:
#   --user USER      account to operate on (required)
#   --confirm WORD   entries are only really deleted when WORD is the codeword
#                    checked below; otherwise the run is a dry run
#   --dupes-only     only consider imported entries that look like duplicates
my ( $user, $confirm, $dupes );
my $usage = "Usage: $0 -u USER -c CODEWORD [ --dupes-only ]\n";

# GetOptions returns false (after complaining on STDERR) when it meets an
# option it cannot parse. This tool deletes data, so refuse to continue with
# a command line that was only partially understood.
GetOptions(
    'user=s'     => \$user,
    'confirm=s'  => \$confirm,
    'dupes-only' => \$dupes,
) or die $usage;

my $u = LJ::load_user( $user )
    or die $usage;

# Collapse the codeword into a plain boolean so later code can just test it.
$confirm = ( $confirm && $confirm eq 'b00p' ) ? 1 : 0;
|
|
|
|
# Fetch the entry import map for this account. Later code treats the map's
# values as jitemids and its keys as "domain/user/ditemid" source strings.
my %map = %{ DW::Worker::ContentImporter::Local::Entries->get_entry_map( $u ) || {} };
if ( !%map ) {
    say 'Account has no imported entries, nothing to do.';
    exit 0;
}

# Reverse lookup for entries: jitemid => source string.
my %rmap = map { ( $map{$_}, $_ ) } keys %map;

# Same again for comments: fetch the comment import map and flip it, so that
# %csrc can be probed by the local comment id (jtalkid).
my %csrc_in = %{ DW::Worker::ContentImporter::Local::Comments->get_comment_map( $u ) || {} };
my %csrc    = map { ( $csrc_in{$_}, $_ ) } keys %csrc_in;
|
|
|
|
# Load every entry in the journal so duplicates can be detected. The entry
# body is not used for duplicate detection because it sometimes differs due
# to <user> vs <lj> tag changes as the importer was updated over the years.
my $dbcr = LJ::get_cluster_reader( $u )
    or die "Unable to get a cluster reader for this user\n";

my $entries = $dbcr->selectall_hashref( q{
    SELECT log2.jitemid, posterid, eventtime, replycount, subject, event, security, allowmask
    FROM log2 INNER JOIN logtext2
        ON log2.journalid = logtext2.journalid AND log2.jitemid = logtext2.jitemid
    WHERE log2.journalid = ?
}, 'jitemid', undef, $u->id );

# Without this guard a failed query would surface later as an opaque
# "not a HASH reference" error when the result is dereferenced.
die "Failed to load entries: " . ( $dbcr->errstr // 'unknown error' ) . "\n"
    unless $entries;

for my $jitemid ( keys %$entries ) {
    my $row = $entries->{$jitemid};

    # Canonicalize old-style <lj ...> tags in the subject BEFORE hashing.
    # Once the subject has been replaced by a hex digest, the /<lj/ test that
    # entry_key() performs can never match, so two copies of the same entry
    # that differ only in tag style would never be grouped together.
    # (This only works for LJ-sourced content.)
    if ( defined $row->{subject} && $row->{subject} =~ /<lj/i ) {
        $row->{subject} = DW::Worker::ContentImporter::LiveJournal->remap_lj_user(
            { hostname => 'livejournal.com' },
            $row->{subject}
        );
    }

    # Keep only digests of the text columns; they are compared, never shown.
    $row->{subject} = md5_hex( $row->{subject} // '' );
    $row->{event}   = md5_hex( $row->{event}   // '' );

    # Subtract imported comments from replycount, so that what is left is the
    # number of comments that were not imported.
    my $cmts = LJ::Talk::get_talk_data( $u, 'L', $jitemid ) || {};
    foreach my $jtalkid ( keys %$cmts ) {
        # Comments in state D, S or B are skipped (presumably because they
        # are not part of replycount to begin with -- TODO confirm).
        next if $cmts->{$jtalkid}->{state} =~ /^[DSB]$/;

        $row->{replycount}-- if exists $csrc{$jtalkid};

        if ( $row->{replycount} < 0 ) {
            die "Invalid accounting! $jitemid replycount went negative!\n";
        }
    }
}
|
|
|
|
# entry_key( $jitemid )
#
# Returns the string used to group entries that are probably the same post:
# the entry's posterid, eventtime and subject joined with '.'.
#
# Dies with 'not found in list' if $jitemid is not present in $entries.
sub entry_key {
    my $jitemid = $_[0] + 0;

    # This check has to come before anything touches
    # $entries->{$jitemid}->{...}: a nested lookup autovivifies the outer
    # slot, after which exists() is always true and the guard never fires.
    die 'not found in list' unless exists $entries->{$jitemid};

    my $row = $entries->{$jitemid};

    # Remap content, just in case (this only works for LJ; given that's the
    # vast majority of things, it is good enough for now).
    # NOTE(review): the loading code visible in this file stores an md5_hex
    # digest in {subject} before this sub is called, in which case this
    # pattern cannot match and the branch is only a safety net -- confirm.
    if ( $row->{subject} =~ /<lj/i ) {
        $row->{subject} = DW::Worker::ContentImporter::LiveJournal->remap_lj_user(
            { hostname => 'livejournal.com' },
            $row->{subject}
        );
    }

    return join '.', map { $row->{$_} } qw/ posterid eventtime subject /;
}
|
|
# Bucket every entry in the journal by its duplicate key:
# key => [ jitemid, ... ]. A bucket with more than one member is a set of
# candidate duplicates.
my %dupects;
for my $id ( keys %$entries ) {
    push @{ $dupects{ entry_key( $id ) } }, $id;
}

# In dupes mode, print each bucket that has more than one member, showing
# every jitemid together with its remaining (non-imported) reply count.
if ( $dupes ) {
    say 'Possible duplicates:';
    for my $key ( sort keys %dupects ) {
        my @ids = @{ $dupects{$key} };
        next unless scalar @ids > 1;

        my $listing = join( ', ', map { "$_($entries->{$_}->{replycount})" } @ids );
        print " * $key: " . $listing . "\n";
    }
}
|
|
|
|
# Iterate each imported entry (highest jitemid first) and see if we can
# delete it. An entry is deleted when:
#   * its import_source cannot be canonicalized, or
#   * it has no comments left after imported ones were subtracted, and, in
#     dupes mode, it still has at least one other entry with the same key.
# Nothing is actually deleted unless the confirmation codeword was supplied;
# otherwise the decision is only reported.
ENTRY: foreach my $jitemid ( sort { $b <=> $a } keys %rmap ) {
    my $o_entry = LJ::Entry->new( $u, jitemid => $jitemid );

    # Work out the duplicate key up front. The bookkeeping at the bottom of
    # the loop needs it on every path that reaches deletion, including the
    # "source cannot be canonicalized" path; computing it only after that
    # check would leave it undefined there, and the entry would never be
    # removed from its duplicate bucket.
    my $key = entry_key( $jitemid );

    # If the source cannot be canonicalized the entry must be deleted, so
    # all of the keep-or-skip checks only apply when canonicalization works.
    if ( canonicalize_source( $o_entry ) ) {

        if ( $dupes ) {
            next ENTRY
                unless exists $dupects{$key} && scalar @{ $dupects{$key} } > 1;

            # If we get here, there are duplicates of this entry -- at least
            # two still remain (this one + one more). Possibly more.
            foreach my $dupeid ( @{ $dupects{$key} } ) {
                next if $dupeid == $jitemid;

                my $entry = LJ::Entry->new( $u, jitemid => $dupeid );

                # For public entries, print enough to compare them by hand.
                if ( $o_entry->security eq "public" ) {
                    my ( $domain, $src_user, $ditemid ) = split( m!/!, $rmap{$jitemid} );
                    print "$jitemid ( $rmap{$jitemid} ) $dupeid: " . $entry->prop( 'import_source' ) . "\n";
                    print " * imported: " . $o_entry->url . "\n";
                    print " * dupe: " . $entry->url . "\n";
                    print " * original: http://$src_user.$domain/$ditemid.html\n";
                }

                # MAGICAL EDGE CASE: find "imported entries" that aren't
                # marked with an import_source. This happened to one user,
                # one time...
                unless ( $entry->prop( 'import_source' ) ) {
                    $entry->set_prop( import_source => $o_entry->prop( 'import_source' ) );
                }

                # Another possibility: we got dupes because one entry isn't
                # using the canonical form, so try to canonicalize it too.
                canonicalize_source( $entry );
            }
        }

        # Any comments left after subtracting imported ones? Keep the entry.
        next ENTRY if $entries->{$jitemid}->{replycount} > 0;
    }

    # If we get here, the entry is destined for the shredder.
    print "$rmap{$jitemid} (jitemid $jitemid) scheduled for deletion";
    if ( $confirm ) {
        my $rv = LJ::delete_entry( $u, $jitemid, 0, undef );
        if ( $rv ) {
            say ' ... deleted';
        }
        else {
            say ' ... FAILED TO DELETE';
        }
    }
    else {
        say ' ... confirmation not set';
    }

    # Paperwork: this entry can no longer 'cause' duplicates, so drop it from
    # its bucket. This also happens on a dry run, so the report matches what
    # a confirmed run would decide.
    $dupects{$key} = [ grep { $_ != $jitemid } @{ $dupects{$key} } ];
}
|
|
|
|
# Comments attached to nodeid 0 are not supposed to exist, but some imported
# comments have been seen in that state. Clean them up, but only on a
# confirmed run.
LJ::delete_all_comments( $u, 'L', 0 ) if $confirm;

exit 0;
|
|
|
|
|
|
|
|
# canonicalize_source( $entry )
#
# Makes sure the entry's import_source prop has the canonical
# "<site>.com/<user>/<ditemid>" form. Known broken forms are rewritten in
# place (with a log line); where the broken form carries no username, the
# --user value from the command line is used.
#
# Returns 1 if the source was already canonical or has been repaired, and 0
# (after logging "UNKNOWN SOURCE FORMAT") if the format is not recognized.
sub canonicalize_source {
    my ( $entry ) = @_;
    my $source = $entry->prop( 'import_source' );

    # Already canonical: nothing to do.
    return 1 if $source =~ m!^(?:livejournal|insanejournal)\.com/[a-z0-9_]+/\d+$!;

    # Try each known broken form in turn; the first that matches decides
    # what the repaired value is.
    my $repaired;
    if ( $source =~ m!^livejournal\.com//(\d+)$! ) {
        # Username segment is empty (left behind by an earlier bug in the
        # dupe fixer).
        $repaired = "livejournal.com/$user/$1";
    }
    elsif ( $source =~ m!http://([a-z0-9_-]*)\.((?:livejournal|insanejournal)\.com)/(\d+)\.html$! ) {
        # Full entry URL, e.g. http://runpunkrun.livejournal.com/334.html
        my ( $loc_user, $host, $ditemid ) = ( $1, $2, $3 );
        $loc_user =~ s/-/_/g;

        # The URL may have been recorded with no user in it (an old
        # undefined-variable bug); fall back to the command-line user.
        $loc_user ||= $user;

        $repaired = "$host/$loc_user/$ditemid";
    }
    elsif ( $source =~ m!^#/(\d+)\.html$! ) {
        # Another old bug left sources like "#/123.html". LJ imports vastly
        # outnumber IJ ones, so assume LJ and the same username.
        $repaired = "livejournal.com/$user/$1";
    }

    unless ( defined $repaired ) {
        say "UNKNOWN SOURCE FORMAT: $source";
        return 0;
    }

    say "Fixing import_source: $source => $repaired";
    $entry->set_prop( import_source => $repaired );
    return 1;
}
|