mourningdove/cgi-bin/LJ/ParseFeed.pm
2026-05-24 01:03:05 +00:00

611 lines
19 KiB
Perl

# This code was forked from the LiveJournal project owned and operated
# by Live Journal, Inc. The code has been modified and expanded by
# Dreamwidth Studios, LLC. These files were originally licensed under
# the terms of the license supplied by Live Journal, Inc, which can
# currently be found at:
#
# http://code.livejournal.org/trac/livejournal/browser/trunk/LICENSE-LiveJournal.txt
#
# In accordance with the original license, this code and all its
# modifications are provided under the GNU General Public License.
# A copy of that license can be found in the LICENSE file included as
# part of this distribution.
package LJ::ParseFeed;
use strict;
use DW::XML::RSS;
use DW::XML::Parser;
# <LJFUNC>
# name: LJ::ParseFeed::parse_feed
# des: Parses an RSS/Atom feed.
# class:
# args: content, type?
# des-content: Feed content.
# des-type: Optional; can be "atom" or "rss".
# If type isn't supplied, the function will try to guess it
# based on contents.
# info: items - An arrayref of item hashes, in the same order they were
# in the feed.
# Each item contains: link - URL of the item; id - unique identifier (optional);
# text - text of the item; subject - subject;
# time - in format 'yyyy-mm-dd hh:mm' (optional).
# returns: Three arguments: $feed, $error, arrayref of items.
# $feed, which is a hash with the following keys:
# type - 'atom' or 'rss'; version - version of the feed in its
# standard; link - URL of the feed; title - title of the feed;
# description - description of the feed.
# The second argument returned is $error, which, if defined, is a
# human-readable error string. The third argument is an
# arrayref of items, same as $feed->{'items'}.
# </LJFUNC>
sub parse_feed {
my ( $content, $type ) = @_;
my ( $feed, $items, $error );
my $parser;
# is it RSS or Atom?
# Atom feeds are rare for now, so prefer to err in favor of RSS
# simple heuristic: Atom feeds will have '<feed' somewhere
# TODO: maybe store the feed's type on creation in a userprop and not guess here
if ( ( defined $type && $type eq 'atom' ) || $content =~ m!\<feed! ) {
# try treating it as an atom feed
$parser = new DW::XML::Parser(
Style => 'Stream',
Namespaces => 1,
Pkg => 'LJ::ParseFeed::Atom'
);
return ( "", "failed to create XML parser" ) unless $parser;
eval { $parser->parse($content); };
if ($@) {
$error = "XML parser error: $@";
}
else {
( $feed, $items, $error ) = LJ::ParseFeed::Atom::results();
}
if ( $feed || $type eq 'atom' ) {
# there was a top-level <feed> there, or we're forced to treat
# as an Atom feed, so even if $error is set,
# don't try RSS
$feed->{'type'} = 'atom';
return ( $feed, $error, $items );
}
}
# try parsing it as RSS
$parser = new DW::XML::RSS;
return ( "", "failed to create RSS parser" ) unless $parser;
# custom LJ/DW namespaces
$parser->add_module(
prefix => 'nslj',
uri => 'http://www.livejournal.org/rss/lj/1.0/'
);
$parser->add_module(
prefix => 'atom',
uri => 'http://www.w3.org/2005/Atom'
);
eval { $parser->parse($content); };
if ($@) {
$error = "RSS parser error: $@";
return ( "", $error );
}
$feed = {};
$feed->{'type'} = 'rss';
$feed->{'version'} = $parser->{'version'};
foreach (qw (link title description)) {
$feed->{$_} = $parser->{'channel'}->{$_}
if $parser->{'channel'}->{$_};
}
$feed->{'atom:id'} = $parser->{channel}->{atom}->{id} if defined $parser->{channel}->{atom};
$feed->{'items'} = [];
foreach ( @{ $parser->{'items'} } ) {
my $item = {};
$item->{'subject'} = $_->{'title'};
$item->{'text'} = $_->{'description'};
$item->{'link'} = $_->{'link'} if $_->{'link'};
$item->{'id'} = $_->{'guid'} if $_->{'guid'};
my $nsenc = 'http://purl.org/rss/1.0/modules/content/';
if ( $_->{$nsenc} && ref( $_->{$nsenc} ) eq "HASH" ) {
# prefer content:encoded if present
$item->{'text'} = $_->{$nsenc}->{'encoded'}
if defined $_->{$nsenc}->{'encoded'};
}
my ( $time, $author );
$time = time822_to_time( $_->{pubDate} ) if $_->{pubDate};
$author = $_->{nslj}->{poster}
if $_->{nslj} && ref $_->{nslj} eq "HASH";
# Dublin Core
if ( $_->{dc} && ref $_->{dc} eq "HASH" ) {
if ( $_->{dc}->{creator} ) {
my $creator = $_->{dc}->{creator};
$author = ref $creator eq 'ARRAY' ? join( ', ', @$creator ) : $creator;
}
$time = w3cdtf_to_time( $_->{dc}->{date} ) if $_->{dc}->{date};
}
$item->{time} = $time if $time;
$item->{author} = $author if $author;
push @{ $feed->{items} }, $item;
}
return ( $feed, undef, $feed->{'items'} );
}
# convert rfc822-time in RSS's <pubDate> to our time
# see http://www.faqs.org/rfcs/rfc822.html
# RFC822 specifies 2 digits for year, and RSS2.0 refers to RFC822,
# but real RSS2.0 feeds apparently use 4 digits.
sub time822_to_time {
my $t822 = shift;
# remove day name if present
$t822 =~ s/^\s*\w+\s*,//;
# remove whitespace
$t822 =~ s/^\s*//;
# break it up
if ( $t822 =~ m!(\d?\d)\s+(\w+)\s+(\d\d\d\d)\s+(\d?\d):(\d\d)! ) {
my ( $day, $mon, $year, $hour, $min ) = ( $1, $2, $3, $4, $5 );
$day = "0" . $day if length($day) == 1;
$hour = "0" . $hour if length($hour) == 1;
$mon = {
'Jan' => '01',
'Feb' => '02',
'Mar' => '03',
'Apr' => '04',
'May' => '05',
'Jun' => '06',
'Jul' => '07',
'Aug' => '08',
'Sep' => '09',
'Oct' => '10',
'Nov' => '11',
'Dec' => '12'
}->{$mon};
return undef unless $mon;
return "$year-$mon-$day $hour:$min";
}
else {
return undef;
}
}
# convert W3C-DTF to our internal format
# see http://www.w3.org/TR/NOTE-datetime
# Based very loosely on code from DateTime::Format::W3CDTF,
# which isn't stable yet so we can't use it directly.
sub w3cdtf_to_time {
my $tw3 = shift;
# TODO: Should somehow return the timezone offset
# so that it can stored... but we don't do timezones
# yet anyway. For now, just strip the timezone
# portion if it is present, along with the decimal
# fractions of a second.
$tw3 =~ s/(?:\.\d+)?(?:[+-]\d{1,2}:\d{1,2}|Z)$//;
$tw3 =~ s/^\s*//;
$tw3 =~ s/\s*$//; # Eat any superflous whitespace
# We can only use complete times, so anything which
# doesn't feature the time part is considered invalid.
# This is working around clients that don't implement W3C-DTF
# correctly, and only send single digit values in the dates.
# 2004-4-8T16:9:4Z vs 2004-04-08T16:09:44Z
# If it's more messed up than that, reject it outright.
$tw3 =~ /^(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?$/
or return undef;
my %pd; # parsed date
$pd{Y} = $1;
$pd{M} = $2;
$pd{D} = $3;
$pd{h} = $4;
$pd{m} = $5;
$pd{s} = $6;
# force double digits
foreach (qw/ M D h m s /) {
next unless defined $pd{$_};
$pd{$_} = sprintf "%02d", $pd{$_};
}
return $pd{s}
? "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}:$pd{s}"
: "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}";
}
package LJ::ParseFeed::Atom;
our ( $feed, $item, $data );
our ( $ddepth, $dholder ); # for accumulating;
our @items;
our $error;
sub err {
$error = shift unless $error;
}
sub results {
return ( $feed, \@items, $error );
}
# $name under which we'll store accumulated data may be different
# from $tag which causes us to store it
# $name may be a scalarref pointing to where we should store
# swallowing is achieved by calling startaccum('');
sub startaccum {
my $name = shift;
return err ("Tag found under neither <feed> nor <entry>")
unless $feed || $item;
$data = ""; # defining $data triggers accumulation
$ddepth = 1;
if ($name) {
# if $name is a scalarref, it's actually our $dholder
if ( ref $name eq 'SCALAR' ) {
$dholder = $name;
}
else {
$dholder = $item ? \$item->{$name} : \$feed->{$name};
}
}
else {
$dholder = undef; # no $name
}
return;
}
sub swallow {
return startaccum('');
}
sub StartDocument {
( $feed, $item, $data ) = ( undef, undef, undef );
@items = ();
undef $error;
}
sub StartTag {
# $_ carries the unparsed tag
my ( $p, $tag ) = @_;
my $holder;
# do nothing if there has been an error
return if $error;
# are we just accumulating data?
if ( defined $data ) {
$data .= $_;
$ddepth++;
return;
}
# where we'll usually store info
$holder = $item ? $item : $feed;
TAGS: {
if ( $tag eq 'feed' ) {
return err ("Nested <feed> tags")
if $feed;
$feed = {};
$feed->{'standard'} = 'atom';
$feed->{'version'} = $_{'version'};
return err ("Incompatible version specified in <feed>")
if $feed->{'version'} && $feed->{'version'} < 0.3;
last TAGS;
}
if ( $tag eq 'entry' ) {
return err ("Nested <entry> tags")
if $item;
$item = {};
last TAGS;
}
# at this point, we must have a top-level <feed> or <entry>
# to write into
return err ("Tag found under neither <feed> nor <entry>")
unless $holder;
if ( $tag eq 'link' ) {
# store 'self' and 'hub' rels, for PubSubHubbub support; but only valid
# for the feed, so make sure $item is undef
if ( !$item && $_{rel} && ( $_{rel} eq 'self' || $_{rel} eq 'hub' ) ) {
return err ('Feed not yet defined')
unless $feed;
# allow these to be specified multiple times, the spec allows for multiple
# hubs. the self link shouldn't allow multiples but it won't hurt if we let it.
push @{ $feed->{ $_{rel} } ||= [] }, $_{href};
last TAGS;
}
# ignore links with rel= anything but alternate
# and treat links as rel=alternate if not explicit
unless ( !$_{'rel'} || $_{'rel'} eq 'alternate' ) {
swallow();
last TAGS;
}
# if multiple alternates are specified, prefer the one
# that doesn't have a type of text/plain.
# see also t/parsefeed-atom-link2.t
if ( $holder->{link} && $_{type} && $_{type} eq 'text/plain' ) {
swallow();
last TAGS;
}
$holder->{'link'} = $_{'href'};
return err ("No href attribute in <link>")
unless $holder->{'link'};
last TAGS;
}
if ( $tag eq 'content' ) {
return err ("<content> outside <entry>")
unless $item;
# if type is multipart/alternative, we continue recursing
# otherwise we accumulate
my $type = $_{'type'} || "text/plain";
unless ( $type eq "multipart/alternative" ) {
push @{ $item->{'contents'} }, [ $type, "" ];
startaccum( \$item->{'contents'}->[-1]->[1] );
last TAGS;
}
# it's multipart/alternative, so recurse, but don't swallow
last TAGS;
}
# we want to store the value of the nested <name> element
# in the author slot, not accumulate the raw value -
# use temp key "inauth" to detect the nesting
if ( $tag eq 'author' ) {
$holder->{inauth} = 1;
last TAGS;
}
if ( $tag eq 'name' ) {
if ( $holder->{inauth} ) {
startaccum('author');
}
else {
swallow();
}
last TAGS;
}
if ( $tag eq 'poster' ) {
$holder->{ljposter} = $_{user};
return err ("No user attribute in <$tag>")
unless $holder->{ljposter};
last TAGS;
}
# store tags which should require no further
# processing as they are, and others under _atom_*, to be processed
# in EndTag under </entry>
if ( $tag eq 'title' ) {
if ($item) { # entry's subject
startaccum("subject");
}
else { # feed's title
startaccum($tag);
}
last TAGS;
}
if ( $tag eq 'atom:id' || $tag eq 'id' ) {
startaccum($tag);
last TAGS;
}
if ( $tag eq 'tagline' && !$item ) { # feed's tagline, our "description"
startaccum("description");
last TAGS;
}
# accumulate and store
startaccum( "_atom_" . $tag );
last TAGS;
}
return;
}
sub EndTag {
# $_ carries the unparsed tag
my ( $p, $tag ) = @_;
# do nothing if there has been an error
return if $error;
# are we accumulating data?
if ( defined $data ) {
$ddepth--;
if ( $ddepth == 0 ) { # stop accumulating
$$dholder = $data
if $dholder;
undef $data;
return;
}
$data .= $_;
return;
}
TAGS: {
if ( $tag eq 'entry' ) {
# finalize item...
# generate suitable text from $item->{'contents'}
my $content;
$item->{'contents'} ||= [];
unless ( scalar( @{ $item->{'contents'} } ) >= 1 ) {
# this item had no <content>
# maybe it has <summary>? if so, use <summary>
# TODO: type= or encoding issues here? perhaps unite
# handling of <summary> with that of <content>?
if ( $item->{'_atom_summary'} ) {
$item->{'text'} = $item->{'_atom_summary'};
delete $item->{'contents'};
}
else {
# nothing to display, so ignore this entry
undef $item;
last TAGS;
}
}
unless ( $item->{'text'} ) { # unless we already have text
if ( scalar( @{ $item->{'contents'} } ) == 1 ) {
# only one <content> section
$content = $item->{'contents'}->[0];
}
else {
# several <content> section, must choose the best one
foreach ( @{ $item->{'contents'} } ) {
if ( $_->[0] eq "application/xhtml+xml" ) { # best match
$content = $_;
last; # don't bother to look at others
}
if ( $_->[0] =~ m!html! ) { # some kind of html/xhtml/html+xml, etc.
# choose this unless we've already chosen some html
$content = $_
unless $content->[0] =~ m!html!;
next;
}
if ( $_->[0] eq "text/plain" ) {
# choose this unless we have some html already
$content = $_
unless $content->[0] =~ m!html!;
next;
}
}
# if we didn't choose anything, pick the first one
$content = $item->{'contents'}->[0]
unless $content;
}
# we ignore the 'mode' attribute of <content>. If it's "xml", we've
# stringified it by accumulation; if it's "escaped", our parser
# unescaped it
# TODO: handle mode=base64?
$item->{'text'} = $content->[1];
delete $item->{'contents'};
}
# generate time
my $w3time =
$item->{'_atom_created'}
|| $item->{'_atom_published'}
|| $item->{'_atom_modified'}
|| $item->{'_atom_updated'};
my $time;
if ($w3time) {
# see http://www.w3.org/TR/NOTE-datetime for format
# we insist on having granularity up to a minute,
# and ignore finer data as well as the timezone, for now
if ( $w3time =~ m!^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d)! ) {
$time = "$1-$2-$3 $4:$5";
}
}
$item->{time} = $time if $time;
# if we found ljposter, use that as preferred author
$item->{author} = $item->{ljposter} if defined $item->{ljposter};
delete $item->{ljposter};
# get rid of all other tags we don't need anymore
foreach ( keys %$item ) {
delete $item->{$_} if substr( $_, 0, 6 ) eq '_atom_';
}
push @items, $item;
undef $item;
last TAGS;
}
if ( $tag eq 'author' ) {
my $holder = $item ? $item : $feed;
delete $holder->{inauth};
last TAGS;
}
if ( $tag eq 'feed' ) {
# finalize feed
# if feed author exists, all items should default to it
if ( defined $feed->{author} ) {
$_->{author} ||= $feed->{author} foreach @items;
}
# get rid of all other tags we don't need anymore
foreach ( keys %$feed ) {
delete $feed->{$_} if substr( $_, 0, 6 ) eq '_atom_';
}
# link the feed with its itms
$feed->{'items'} = \@items
if $feed;
last TAGS;
}
}
return;
}
sub Text {
my $p = shift;
# do nothing if there has been an error
return if $error;
$data .= $_ if defined $data;
}
sub PI {
# ignore processing instructions
return;
}
sub EndDocument {
# if we parsed a feed, link items to it
$feed->{'items'} = \@items
if $feed;
return;
}
1;