611 lines
19 KiB
Perl
611 lines
19 KiB
Perl
# This code was forked from the LiveJournal project owned and operated
|
|
# by Live Journal, Inc. The code has been modified and expanded by
|
|
# Dreamwidth Studios, LLC. These files were originally licensed under
|
|
# the terms of the license supplied by Live Journal, Inc, which can
|
|
# currently be found at:
|
|
#
|
|
# http://code.livejournal.org/trac/livejournal/browser/trunk/LICENSE-LiveJournal.txt
|
|
#
|
|
# In accordance with the original license, this code and all its
|
|
# modifications are provided under the GNU General Public License.
|
|
# A copy of that license can be found in the LICENSE file included as
|
|
# part of this distribution.
|
|
|
|
package LJ::ParseFeed;
|
|
use strict;
|
|
|
|
use DW::XML::RSS;
|
|
use DW::XML::Parser;
|
|
|
|
# <LJFUNC>
|
|
# name: LJ::ParseFeed::parse_feed
|
|
# des: Parses an RSS/Atom feed.
|
|
# class:
|
|
# args: content, type?
|
|
# des-content: Feed content.
|
|
# des-type: Optional; can be "atom" or "rss".
|
|
# If type isn't supplied, the function will try to guess it
|
|
# based on contents.
|
|
# info: items - An arrayref of item hashes, in the same order they were
|
|
# in the feed.
|
|
# Each item contains: link - URL of the item; id - unique identifier (optional);
|
|
# text - text of the item; subject - subject;
|
|
# time - in format 'yyyy-mm-dd hh:mm' (optional).
|
|
# returns: Three arguments: $feed, $error, arrayref of items.
|
|
# $feed, which is a hash with the following keys:
|
|
# type - 'atom' or 'rss'; version - version of the feed in its
|
|
# standard; link - URL of the feed; title - title of the feed;
|
|
# description - description of the feed.
|
|
# The second argument returned is $error, which, if defined, is a
|
|
# human-readable error string. The third argument is an
|
|
# arrayref of items, same as $feed->{'items'}.
|
|
# </LJFUNC>
|
|
sub parse_feed {
|
|
my ( $content, $type ) = @_;
|
|
my ( $feed, $items, $error );
|
|
my $parser;
|
|
|
|
# is it RSS or Atom?
|
|
# Atom feeds are rare for now, so prefer to err in favor of RSS
|
|
# simple heuristic: Atom feeds will have '<feed' somewhere
|
|
# TODO: maybe store the feed's type on creation in a userprop and not guess here
|
|
|
|
if ( ( defined $type && $type eq 'atom' ) || $content =~ m!\<feed! ) {
|
|
|
|
# try treating it as an atom feed
|
|
$parser = new DW::XML::Parser(
|
|
Style => 'Stream',
|
|
Namespaces => 1,
|
|
Pkg => 'LJ::ParseFeed::Atom'
|
|
);
|
|
return ( "", "failed to create XML parser" ) unless $parser;
|
|
eval { $parser->parse($content); };
|
|
if ($@) {
|
|
$error = "XML parser error: $@";
|
|
}
|
|
else {
|
|
( $feed, $items, $error ) = LJ::ParseFeed::Atom::results();
|
|
}
|
|
|
|
if ( $feed || $type eq 'atom' ) {
|
|
|
|
# there was a top-level <feed> there, or we're forced to treat
|
|
# as an Atom feed, so even if $error is set,
|
|
# don't try RSS
|
|
$feed->{'type'} = 'atom';
|
|
return ( $feed, $error, $items );
|
|
}
|
|
}
|
|
|
|
# try parsing it as RSS
|
|
$parser = new DW::XML::RSS;
|
|
return ( "", "failed to create RSS parser" ) unless $parser;
|
|
|
|
# custom LJ/DW namespaces
|
|
$parser->add_module(
|
|
prefix => 'nslj',
|
|
uri => 'http://www.livejournal.org/rss/lj/1.0/'
|
|
);
|
|
$parser->add_module(
|
|
prefix => 'atom',
|
|
uri => 'http://www.w3.org/2005/Atom'
|
|
);
|
|
|
|
eval { $parser->parse($content); };
|
|
if ($@) {
|
|
$error = "RSS parser error: $@";
|
|
return ( "", $error );
|
|
}
|
|
|
|
$feed = {};
|
|
$feed->{'type'} = 'rss';
|
|
$feed->{'version'} = $parser->{'version'};
|
|
|
|
foreach (qw (link title description)) {
|
|
$feed->{$_} = $parser->{'channel'}->{$_}
|
|
if $parser->{'channel'}->{$_};
|
|
}
|
|
$feed->{'atom:id'} = $parser->{channel}->{atom}->{id} if defined $parser->{channel}->{atom};
|
|
|
|
$feed->{'items'} = [];
|
|
|
|
foreach ( @{ $parser->{'items'} } ) {
|
|
my $item = {};
|
|
$item->{'subject'} = $_->{'title'};
|
|
$item->{'text'} = $_->{'description'};
|
|
$item->{'link'} = $_->{'link'} if $_->{'link'};
|
|
$item->{'id'} = $_->{'guid'} if $_->{'guid'};
|
|
|
|
my $nsenc = 'http://purl.org/rss/1.0/modules/content/';
|
|
if ( $_->{$nsenc} && ref( $_->{$nsenc} ) eq "HASH" ) {
|
|
|
|
# prefer content:encoded if present
|
|
$item->{'text'} = $_->{$nsenc}->{'encoded'}
|
|
if defined $_->{$nsenc}->{'encoded'};
|
|
}
|
|
|
|
my ( $time, $author );
|
|
$time = time822_to_time( $_->{pubDate} ) if $_->{pubDate};
|
|
$author = $_->{nslj}->{poster}
|
|
if $_->{nslj} && ref $_->{nslj} eq "HASH";
|
|
|
|
# Dublin Core
|
|
if ( $_->{dc} && ref $_->{dc} eq "HASH" ) {
|
|
if ( $_->{dc}->{creator} ) {
|
|
my $creator = $_->{dc}->{creator};
|
|
$author = ref $creator eq 'ARRAY' ? join( ', ', @$creator ) : $creator;
|
|
}
|
|
$time = w3cdtf_to_time( $_->{dc}->{date} ) if $_->{dc}->{date};
|
|
}
|
|
|
|
$item->{time} = $time if $time;
|
|
$item->{author} = $author if $author;
|
|
push @{ $feed->{items} }, $item;
|
|
}
|
|
|
|
return ( $feed, undef, $feed->{'items'} );
|
|
}
|
|
|
|
# convert rfc822-time in RSS's <pubDate> to our time
|
|
# see http://www.faqs.org/rfcs/rfc822.html
|
|
# RFC822 specifies 2 digits for year, and RSS2.0 refers to RFC822,
|
|
# but real RSS2.0 feeds apparently use 4 digits.
|
|
sub time822_to_time {
|
|
my $t822 = shift;
|
|
|
|
# remove day name if present
|
|
$t822 =~ s/^\s*\w+\s*,//;
|
|
|
|
# remove whitespace
|
|
$t822 =~ s/^\s*//;
|
|
|
|
# break it up
|
|
if ( $t822 =~ m!(\d?\d)\s+(\w+)\s+(\d\d\d\d)\s+(\d?\d):(\d\d)! ) {
|
|
my ( $day, $mon, $year, $hour, $min ) = ( $1, $2, $3, $4, $5 );
|
|
$day = "0" . $day if length($day) == 1;
|
|
$hour = "0" . $hour if length($hour) == 1;
|
|
$mon = {
|
|
'Jan' => '01',
|
|
'Feb' => '02',
|
|
'Mar' => '03',
|
|
'Apr' => '04',
|
|
'May' => '05',
|
|
'Jun' => '06',
|
|
'Jul' => '07',
|
|
'Aug' => '08',
|
|
'Sep' => '09',
|
|
'Oct' => '10',
|
|
'Nov' => '11',
|
|
'Dec' => '12'
|
|
}->{$mon};
|
|
return undef unless $mon;
|
|
return "$year-$mon-$day $hour:$min";
|
|
}
|
|
else {
|
|
return undef;
|
|
}
|
|
}
|
|
|
|
# convert W3C-DTF to our internal format
|
|
# see http://www.w3.org/TR/NOTE-datetime
|
|
# Based very loosely on code from DateTime::Format::W3CDTF,
|
|
# which isn't stable yet so we can't use it directly.
|
|
sub w3cdtf_to_time {
|
|
my $tw3 = shift;
|
|
|
|
# TODO: Should somehow return the timezone offset
|
|
# so that it can stored... but we don't do timezones
|
|
# yet anyway. For now, just strip the timezone
|
|
# portion if it is present, along with the decimal
|
|
# fractions of a second.
|
|
|
|
$tw3 =~ s/(?:\.\d+)?(?:[+-]\d{1,2}:\d{1,2}|Z)$//;
|
|
$tw3 =~ s/^\s*//;
|
|
$tw3 =~ s/\s*$//; # Eat any superflous whitespace
|
|
|
|
# We can only use complete times, so anything which
|
|
# doesn't feature the time part is considered invalid.
|
|
|
|
# This is working around clients that don't implement W3C-DTF
|
|
# correctly, and only send single digit values in the dates.
|
|
# 2004-4-8T16:9:4Z vs 2004-04-08T16:09:44Z
|
|
# If it's more messed up than that, reject it outright.
|
|
$tw3 =~ /^(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?$/
|
|
or return undef;
|
|
|
|
my %pd; # parsed date
|
|
$pd{Y} = $1;
|
|
$pd{M} = $2;
|
|
$pd{D} = $3;
|
|
$pd{h} = $4;
|
|
$pd{m} = $5;
|
|
$pd{s} = $6;
|
|
|
|
# force double digits
|
|
foreach (qw/ M D h m s /) {
|
|
next unless defined $pd{$_};
|
|
$pd{$_} = sprintf "%02d", $pd{$_};
|
|
}
|
|
|
|
return $pd{s}
|
|
? "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}:$pd{s}"
|
|
: "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}";
|
|
}
|
|
|
|
package LJ::ParseFeed::Atom;
|
|
|
|
our ( $feed, $item, $data );
|
|
our ( $ddepth, $dholder ); # for accumulating;
|
|
our @items;
|
|
our $error;
|
|
|
|
sub err {
|
|
$error = shift unless $error;
|
|
}
|
|
|
|
sub results {
|
|
return ( $feed, \@items, $error );
|
|
}
|
|
|
|
# $name under which we'll store accumulated data may be different
|
|
# from $tag which causes us to store it
|
|
# $name may be a scalarref pointing to where we should store
|
|
# swallowing is achieved by calling startaccum('');
|
|
|
|
sub startaccum {
|
|
my $name = shift;
|
|
|
|
return err ("Tag found under neither <feed> nor <entry>")
|
|
unless $feed || $item;
|
|
$data = ""; # defining $data triggers accumulation
|
|
$ddepth = 1;
|
|
|
|
if ($name) {
|
|
|
|
# if $name is a scalarref, it's actually our $dholder
|
|
if ( ref $name eq 'SCALAR' ) {
|
|
$dholder = $name;
|
|
}
|
|
else {
|
|
$dholder = $item ? \$item->{$name} : \$feed->{$name};
|
|
}
|
|
}
|
|
else {
|
|
$dholder = undef; # no $name
|
|
}
|
|
return;
|
|
}
|
|
|
|
sub swallow {
|
|
return startaccum('');
|
|
}
|
|
|
|
sub StartDocument {
|
|
( $feed, $item, $data ) = ( undef, undef, undef );
|
|
@items = ();
|
|
undef $error;
|
|
}
|
|
|
|
sub StartTag {
|
|
|
|
# $_ carries the unparsed tag
|
|
my ( $p, $tag ) = @_;
|
|
my $holder;
|
|
|
|
# do nothing if there has been an error
|
|
return if $error;
|
|
|
|
# are we just accumulating data?
|
|
if ( defined $data ) {
|
|
$data .= $_;
|
|
$ddepth++;
|
|
return;
|
|
}
|
|
|
|
# where we'll usually store info
|
|
$holder = $item ? $item : $feed;
|
|
|
|
TAGS: {
|
|
if ( $tag eq 'feed' ) {
|
|
return err ("Nested <feed> tags")
|
|
if $feed;
|
|
$feed = {};
|
|
$feed->{'standard'} = 'atom';
|
|
$feed->{'version'} = $_{'version'};
|
|
return err ("Incompatible version specified in <feed>")
|
|
if $feed->{'version'} && $feed->{'version'} < 0.3;
|
|
last TAGS;
|
|
}
|
|
if ( $tag eq 'entry' ) {
|
|
return err ("Nested <entry> tags")
|
|
if $item;
|
|
$item = {};
|
|
last TAGS;
|
|
}
|
|
|
|
# at this point, we must have a top-level <feed> or <entry>
|
|
# to write into
|
|
return err ("Tag found under neither <feed> nor <entry>")
|
|
unless $holder;
|
|
|
|
if ( $tag eq 'link' ) {
|
|
|
|
# store 'self' and 'hub' rels, for PubSubHubbub support; but only valid
|
|
# for the feed, so make sure $item is undef
|
|
if ( !$item && $_{rel} && ( $_{rel} eq 'self' || $_{rel} eq 'hub' ) ) {
|
|
return err ('Feed not yet defined')
|
|
unless $feed;
|
|
|
|
# allow these to be specified multiple times, the spec allows for multiple
|
|
# hubs. the self link shouldn't allow multiples but it won't hurt if we let it.
|
|
push @{ $feed->{ $_{rel} } ||= [] }, $_{href};
|
|
last TAGS;
|
|
}
|
|
|
|
# ignore links with rel= anything but alternate
|
|
# and treat links as rel=alternate if not explicit
|
|
unless ( !$_{'rel'} || $_{'rel'} eq 'alternate' ) {
|
|
swallow();
|
|
last TAGS;
|
|
}
|
|
|
|
# if multiple alternates are specified, prefer the one
|
|
# that doesn't have a type of text/plain.
|
|
# see also t/parsefeed-atom-link2.t
|
|
if ( $holder->{link} && $_{type} && $_{type} eq 'text/plain' ) {
|
|
swallow();
|
|
last TAGS;
|
|
}
|
|
|
|
$holder->{'link'} = $_{'href'};
|
|
return err ("No href attribute in <link>")
|
|
unless $holder->{'link'};
|
|
last TAGS;
|
|
}
|
|
|
|
if ( $tag eq 'content' ) {
|
|
return err ("<content> outside <entry>")
|
|
unless $item;
|
|
|
|
# if type is multipart/alternative, we continue recursing
|
|
# otherwise we accumulate
|
|
my $type = $_{'type'} || "text/plain";
|
|
unless ( $type eq "multipart/alternative" ) {
|
|
push @{ $item->{'contents'} }, [ $type, "" ];
|
|
startaccum( \$item->{'contents'}->[-1]->[1] );
|
|
last TAGS;
|
|
}
|
|
|
|
# it's multipart/alternative, so recurse, but don't swallow
|
|
last TAGS;
|
|
}
|
|
|
|
# we want to store the value of the nested <name> element
|
|
# in the author slot, not accumulate the raw value -
|
|
# use temp key "inauth" to detect the nesting
|
|
|
|
if ( $tag eq 'author' ) {
|
|
$holder->{inauth} = 1;
|
|
last TAGS;
|
|
}
|
|
|
|
if ( $tag eq 'name' ) {
|
|
if ( $holder->{inauth} ) {
|
|
startaccum('author');
|
|
}
|
|
else {
|
|
swallow();
|
|
}
|
|
last TAGS;
|
|
}
|
|
|
|
if ( $tag eq 'poster' ) {
|
|
$holder->{ljposter} = $_{user};
|
|
return err ("No user attribute in <$tag>")
|
|
unless $holder->{ljposter};
|
|
last TAGS;
|
|
}
|
|
|
|
# store tags which should require no further
|
|
# processing as they are, and others under _atom_*, to be processed
|
|
# in EndTag under </entry>
|
|
if ( $tag eq 'title' ) {
|
|
if ($item) { # entry's subject
|
|
startaccum("subject");
|
|
}
|
|
else { # feed's title
|
|
startaccum($tag);
|
|
}
|
|
last TAGS;
|
|
}
|
|
if ( $tag eq 'atom:id' || $tag eq 'id' ) {
|
|
startaccum($tag);
|
|
last TAGS;
|
|
}
|
|
|
|
if ( $tag eq 'tagline' && !$item ) { # feed's tagline, our "description"
|
|
startaccum("description");
|
|
last TAGS;
|
|
}
|
|
|
|
# accumulate and store
|
|
startaccum( "_atom_" . $tag );
|
|
last TAGS;
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
sub EndTag {
|
|
|
|
# $_ carries the unparsed tag
|
|
my ( $p, $tag ) = @_;
|
|
|
|
# do nothing if there has been an error
|
|
return if $error;
|
|
|
|
# are we accumulating data?
|
|
if ( defined $data ) {
|
|
$ddepth--;
|
|
if ( $ddepth == 0 ) { # stop accumulating
|
|
$$dholder = $data
|
|
if $dholder;
|
|
undef $data;
|
|
return;
|
|
}
|
|
$data .= $_;
|
|
return;
|
|
}
|
|
|
|
TAGS: {
|
|
if ( $tag eq 'entry' ) {
|
|
|
|
# finalize item...
|
|
# generate suitable text from $item->{'contents'}
|
|
my $content;
|
|
$item->{'contents'} ||= [];
|
|
unless ( scalar( @{ $item->{'contents'} } ) >= 1 ) {
|
|
|
|
# this item had no <content>
|
|
# maybe it has <summary>? if so, use <summary>
|
|
# TODO: type= or encoding issues here? perhaps unite
|
|
# handling of <summary> with that of <content>?
|
|
if ( $item->{'_atom_summary'} ) {
|
|
$item->{'text'} = $item->{'_atom_summary'};
|
|
delete $item->{'contents'};
|
|
}
|
|
else {
|
|
# nothing to display, so ignore this entry
|
|
undef $item;
|
|
last TAGS;
|
|
}
|
|
}
|
|
|
|
unless ( $item->{'text'} ) { # unless we already have text
|
|
if ( scalar( @{ $item->{'contents'} } ) == 1 ) {
|
|
|
|
# only one <content> section
|
|
$content = $item->{'contents'}->[0];
|
|
}
|
|
else {
|
|
# several <content> section, must choose the best one
|
|
foreach ( @{ $item->{'contents'} } ) {
|
|
if ( $_->[0] eq "application/xhtml+xml" ) { # best match
|
|
$content = $_;
|
|
last; # don't bother to look at others
|
|
}
|
|
if ( $_->[0] =~ m!html! ) { # some kind of html/xhtml/html+xml, etc.
|
|
# choose this unless we've already chosen some html
|
|
$content = $_
|
|
unless $content->[0] =~ m!html!;
|
|
next;
|
|
}
|
|
if ( $_->[0] eq "text/plain" ) {
|
|
|
|
# choose this unless we have some html already
|
|
$content = $_
|
|
unless $content->[0] =~ m!html!;
|
|
next;
|
|
}
|
|
}
|
|
|
|
# if we didn't choose anything, pick the first one
|
|
$content = $item->{'contents'}->[0]
|
|
unless $content;
|
|
}
|
|
|
|
# we ignore the 'mode' attribute of <content>. If it's "xml", we've
|
|
# stringified it by accumulation; if it's "escaped", our parser
|
|
# unescaped it
|
|
# TODO: handle mode=base64?
|
|
|
|
$item->{'text'} = $content->[1];
|
|
delete $item->{'contents'};
|
|
}
|
|
|
|
# generate time
|
|
my $w3time =
|
|
$item->{'_atom_created'}
|
|
|| $item->{'_atom_published'}
|
|
|| $item->{'_atom_modified'}
|
|
|| $item->{'_atom_updated'};
|
|
|
|
my $time;
|
|
if ($w3time) {
|
|
|
|
# see http://www.w3.org/TR/NOTE-datetime for format
|
|
# we insist on having granularity up to a minute,
|
|
# and ignore finer data as well as the timezone, for now
|
|
if ( $w3time =~ m!^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d)! ) {
|
|
$time = "$1-$2-$3 $4:$5";
|
|
}
|
|
}
|
|
$item->{time} = $time if $time;
|
|
|
|
# if we found ljposter, use that as preferred author
|
|
$item->{author} = $item->{ljposter} if defined $item->{ljposter};
|
|
delete $item->{ljposter};
|
|
|
|
# get rid of all other tags we don't need anymore
|
|
foreach ( keys %$item ) {
|
|
delete $item->{$_} if substr( $_, 0, 6 ) eq '_atom_';
|
|
}
|
|
|
|
push @items, $item;
|
|
undef $item;
|
|
last TAGS;
|
|
}
|
|
|
|
if ( $tag eq 'author' ) {
|
|
my $holder = $item ? $item : $feed;
|
|
delete $holder->{inauth};
|
|
last TAGS;
|
|
}
|
|
|
|
if ( $tag eq 'feed' ) {
|
|
|
|
# finalize feed
|
|
|
|
# if feed author exists, all items should default to it
|
|
if ( defined $feed->{author} ) {
|
|
$_->{author} ||= $feed->{author} foreach @items;
|
|
}
|
|
|
|
# get rid of all other tags we don't need anymore
|
|
foreach ( keys %$feed ) {
|
|
delete $feed->{$_} if substr( $_, 0, 6 ) eq '_atom_';
|
|
}
|
|
|
|
# link the feed with its itms
|
|
$feed->{'items'} = \@items
|
|
if $feed;
|
|
last TAGS;
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
sub Text {
|
|
my $p = shift;
|
|
|
|
# do nothing if there has been an error
|
|
return if $error;
|
|
|
|
$data .= $_ if defined $data;
|
|
}
|
|
|
|
sub PI {
|
|
|
|
# ignore processing instructions
|
|
return;
|
|
}
|
|
|
|
sub EndDocument {
|
|
|
|
# if we parsed a feed, link items to it
|
|
$feed->{'items'} = \@items
|
|
if $feed;
|
|
return;
|
|
}
|
|
|
|
1;
|