# This code was forked from the LiveJournal project owned and operated
# by Live Journal, Inc. The code has been modified and expanded by
# Dreamwidth Studios, LLC. These files were originally licensed under
# the terms of the license supplied by Live Journal, Inc, which can
# currently be found at:
#
# http://code.livejournal.org/trac/livejournal/browser/trunk/LICENSE-LiveJournal.txt
#
# In accordance with the original license, this code and all its
# modifications are provided under the GNU General Public License.
# A copy of that license can be found in the LICENSE file included as
# part of this distribution.

package LJ::ParseFeed;
use strict;

use DW::XML::RSS;
use DW::XML::Parser;

# Month abbreviation => two-digit month number, used by time822_to_time.
my %MONTH_NUM = (
    Jan => '01',
    Feb => '02',
    Mar => '03',
    Apr => '04',
    May => '05',
    Jun => '06',
    Jul => '07',
    Aug => '08',
    Sep => '09',
    Oct => '10',
    Nov => '11',
    Dec => '12',
);

# <LJFUNC>
# name: LJ::ParseFeed::parse_feed
# des: Parses an RSS/Atom feed.
# class:
# args: content, type?
# des-content: Feed content.
# des-type: Optional; can be "atom" or "rss".
#           If type isn't supplied, the function will try to guess it
#           based on contents.
# info: items - An arrayref of item hashes, in the same order they were
#       in the feed.
#       Each item contains: link - URL of the item; id - unique identifier (optional);
#       text - text of the item; subject - subject;
#       time - in format 'yyyy-mm-dd hh:mm' (optional);
#       author - author of the item (optional).
# returns: Three arguments: $feed, $error, arrayref of items.
#          $feed, which is a hash with the following keys:
#          type - 'atom' or 'rss'; version - version of the feed in its
#          standard; link - URL of the feed; title - title of the feed;
#          description - description of the feed.
#          The second argument returned is $error, which, if defined, is a
#          human-readable error string. The third argument is an
#          arrayref of items, same as $feed->{'items'}.
#          If a parser cannot be created, or the RSS parser fails,
#          the return value is ( "", $error ).
# </LJFUNC>
sub parse_feed {
    my ( $content, $type ) = @_;
    my ( $feed, $items, $error );

    # guard against an undefined $type so the comparisons below are safe
    my $force_atom = defined $type && $type eq 'atom';

    # is it RSS or Atom?
    # Atom feeds are rare for now, so prefer to err in favor of RSS
    # simple heuristic: Atom feeds will have '<feed' somewhere
    # NOTE(review): this condition and the parser construction were
    # reconstructed from the surrounding comments and code -- confirm
    # against the upstream copy of this file.
    if ( $force_atom || ( defined $content && $content =~ m!<feed! ) ) {

        # try treating it as an Atom feed
        my $atom_parser = DW::XML::Parser->new(
            Style      => 'Stream',
            Namespaces => 1,
            Pkg        => 'LJ::ParseFeed::Atom',
        );
        return ( "", "failed to create XML parser" ) unless $atom_parser;

        if ( eval { $atom_parser->parse($content); 1 } ) {
            ( $feed, $items, $error ) = LJ::ParseFeed::Atom::results();
        }
        else {
            $error = "XML parser error: $@";
        }

        if ( $feed || $force_atom ) {

            # there was a top-level <feed> there, or we're forced to treat
            # as an Atom feed, so even if $error is set,
            # don't try RSS
            $feed ||= {};
            $feed->{'type'} = 'atom';
            return ( $feed, $error, $items );
        }
    }

    # try parsing it as RSS
    my $rss = DW::XML::RSS->new;
    return ( "", "failed to create RSS parser" ) unless $rss;

    # custom LJ/DW namespaces
    $rss->add_module( prefix => 'nslj', uri => 'http://www.livejournal.org/rss/lj/1.0/' );
    $rss->add_module( prefix => 'atom', uri => 'http://www.w3.org/2005/Atom' );

    unless ( eval { $rss->parse($content); 1 } ) {
        return ( "", "RSS parser error: $@" );
    }

    # read through a local so a missing channel is not autovivified
    my $channel = $rss->{'channel'} || {};

    $feed = {
        type    => 'rss',
        version => $rss->{'version'},
        items   => [],
    };
    foreach my $key (qw(link title description)) {
        $feed->{$key} = $channel->{$key} if $channel->{$key};
    }
    $feed->{'atom:id'} = $channel->{atom}->{id} if defined $channel->{atom};

    my $nsenc = 'http://purl.org/rss/1.0/modules/content/';

    foreach my $entry ( @{ $rss->{'items'} || [] } ) {
        my $item = {
            subject => $entry->{'title'},
            text    => $entry->{'description'},
        };
        $item->{'link'} = $entry->{'link'} if $entry->{'link'};
        $item->{'id'}   = $entry->{'guid'} if $entry->{'guid'};

        # prefer content:encoded if present
        my $encoded = $entry->{$nsenc};
        if ( $encoded && ref $encoded eq 'HASH' && defined $encoded->{'encoded'} ) {
            $item->{'text'} = $encoded->{'encoded'};
        }

        my ( $time, $author );
        $time   = time822_to_time( $entry->{pubDate} ) if $entry->{pubDate};
        $author = $entry->{nslj}->{poster}
            if $entry->{nslj} && ref $entry->{nslj} eq 'HASH';

        # Dublin Core
        my $dc = $entry->{dc};
        if ( $dc && ref $dc eq 'HASH' ) {
            if ( my $creator = $dc->{creator} ) {
                $author = ref $creator eq 'ARRAY' ? join( ', ', @$creator ) : $creator;
            }

            # dc:date wins over pubDate, but only when it actually parses;
            # an unparseable dc:date must not discard a good pubDate
            if ( $dc->{date} ) {
                my $dc_time = w3cdtf_to_time( $dc->{date} );
                $time = $dc_time if $dc_time;
            }
        }

        $item->{time}   = $time   if $time;
        $item->{author} = $author if $author;

        push @{ $feed->{items} }, $item;
    }

    return ( $feed, undef, $feed->{'items'} );
}

# convert rfc822-time in RSS's <pubDate> to our time
# see http://www.faqs.org/rfcs/rfc822.html
# RFC822 specifies 2 digits for year, and RSS2.0 refers to RFC822,
# but real RSS2.0 feeds apparently use 4 digits.
#
# Takes the date string; returns 'yyyy-mm-dd hh:mm', or undef if the
# string can't be understood. Month names are matched on their first
# three letters, case-insensitively, so "SEP" and "September" both work.
# (Explicit "return undef" is kept so the result is always one scalar.)
sub time822_to_time {
    my $t822 = shift;
    return undef unless defined $t822;

    # remove day name if present
    $t822 =~ s/^\s*\w+\s*,//;

    # remove whitespace
    $t822 =~ s/^\s+//;

    # break it up
    my ( $day, $mon, $year, $hour, $min ) =
        $t822 =~ m!(\d?\d)\s+(\w+)\s+(\d\d\d\d)\s+(\d?\d):(\d\d)!
        or return undef;

    my $mon_num = $MONTH_NUM{ ucfirst lc substr( $mon, 0, 3 ) };
    return undef unless $mon_num;

    # day and hour may be single digits; pad them
    return sprintf( '%s-%s-%02d %02d:%s', $year, $mon_num, $day, $hour, $min );
}

# convert W3C-DTF to our internal format
# see http://www.w3.org/TR/NOTE-datetime
# Based very loosely on code from DateTime::Format::W3CDTF,
# which isn't stable yet so we can't use it directly.
#
# Takes the date string; returns 'yyyy-mm-dd hh:mm' or
# 'yyyy-mm-dd hh:mm:ss' (when seconds were given), or undef if the
# string has no time part or is otherwise malformed.
sub w3cdtf_to_time {
    my $tw3 = shift;
    return undef unless defined $tw3;

    # Eat any superfluous whitespace first, so that the end-anchored
    # patterns below still see the timezone of a padded value
    $tw3 =~ s/^\s+//;
    $tw3 =~ s/\s+$//;

    # TODO: Should somehow return the timezone offset
    # so that it can stored... but we don't do timezones
    # yet anyway. For now, just strip the timezone
    # portion if it is present, along with the decimal
    # fractions of a second (with or without a timezone after them).
    $tw3 =~ s/(?:\.\d+)?(?:[+-]\d{1,2}:\d{1,2}|Z)?$//;

    # We can only use complete times, so anything which
    # doesn't feature the time part is considered invalid.

    # This is working around clients that don't implement W3C-DTF
    # correctly, and only send single digit values in the dates.
    # 2004-4-8T16:9:4Z vs 2004-04-08T16:09:44Z
    # If it's more messed up than that, reject it outright.
    my ( $year, @fields ) =
        $tw3 =~ /^(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?$/
        or return undef;

    # force double digits; seconds stay undef when they were not supplied
    my ( $mon, $day, $hour, $min, $sec ) =
        map { defined $_ ? sprintf( '%02d', $_ ) : undef } @fields;

    my $stamp = "$year-$mon-$day $hour:$min";
    $stamp .= ":$sec" if defined $sec;
    return $stamp;
}

package LJ::ParseFeed::Atom;

# Handler package for the Stream-style XML parser created in
# LJ::ParseFeed::parse_feed. State lives in package variables and is
# reset by StartDocument.

our ( $feed, $item, $data );
our ( $ddepth, $dholder );    # for accumulating
our @items;
our $error;

# Record an error message. Only the first error of a parse is kept.
sub err {
    my $msg = shift;
    $error ||= $msg;
    return;
}

# Return ( $feed, $items, $error ) for the most recent parse.
# The items arrayref is a copy made for this call (and linked into
# $feed->{items}), so that a later parse resetting @items does not
# empty the list a previous caller is still holding.
sub results {
    my @parsed = @items;
    $feed->{'items'} = \@parsed if $feed;
    return ( $feed, \@parsed, $error );
}

# Begin accumulating raw data until the current element is closed.
# $name under which we'll store accumulated data may be different
# from $tag which causes us to store it
# $name may be a scalarref pointing to where we should store
# swallowing is achieved by calling startaccum('');
sub startaccum {
    my $name = shift;

    return err("Tag found under neither <feed> nor <entry>")
        unless $feed || $item;

    $data   = "";    # defining $data triggers accumulation
    $ddepth = 1;

    if ( !$name ) {
        $dholder = undef;    # no $name: accumulate, then throw away
    }
    elsif ( ref $name eq 'SCALAR' ) {

        # if $name is a scalarref, it's actually our $dholder
        $dholder = $name;
    }
    else {
        $dholder = $item ? \$item->{$name} : \$feed->{$name};
    }

    return;
}

# Accumulate the current element but discard its contents.
sub swallow {
    return startaccum('');
}

# Reset all parsing state at the start of a document.
sub StartDocument {
    ( $feed, $item, $data ) = ( undef, undef, undef );
    @items = ();
    undef $error;
}

# Handle an opening tag.
# $_ carries the unparsed tag; attributes are read from %_.
sub StartTag {
    my ( $p, $tag ) = @_;

    # do nothing if there has been an error
    return if $error;

    # are we just accumulating data?
    if ( defined $data ) {
        $data .= $_;
        $ddepth++;
        return;
    }

    if ( $tag eq 'feed' ) {
        return err("Nested <feed> tags") if $feed;
        $feed = {
            standard => 'atom',
            version  => $_{'version'},
        };
        return err("Incompatible version specified in <feed>")
            if $feed->{'version'} && $feed->{'version'} < 0.3;
        return;
    }

    if ( $tag eq 'entry' ) {
        return err("Nested <entry> tags") if $item;
        $item = {};
        return;
    }

    # at this point, we must have a top-level <feed> or <entry>
    # to write into; that is where we'll usually store info
    my $holder = $item || $feed;
    return err("Tag found under neither <feed> nor <entry>") unless $holder;

    if ( $tag eq 'link' ) {
        my $rel = $_{rel};

        # store 'self' and 'hub' rels, for PubSubHubbub support; but only valid
        # for the feed, so make sure $item is undef
        if ( !$item && $rel && ( $rel eq 'self' || $rel eq 'hub' ) ) {
            return err('Feed not yet defined') unless $feed;

            # allow these to be specified multiple times, the spec allows for multiple
            # hubs. the self link shouldn't allow multiples but it won't hurt if we let it.
            push @{ $feed->{$rel} ||= [] }, $_{href};
            return;
        }

        # ignore links with rel= anything but alternate
        # and treat links as rel=alternate if not explicit
        if ( $rel && $rel ne 'alternate' ) {
            swallow();
            return;
        }

        # if multiple alternates are specified, prefer the one
        # that doesn't have a type of text/plain.
        # see also t/parsefeed-atom-link2.t
        if ( $holder->{link} && $_{type} && $_{type} eq 'text/plain' ) {
            swallow();
            return;
        }

        $holder->{'link'} = $_{'href'};
        return err("No href attribute in <link>") unless $holder->{'link'};
        return;
    }

    if ( $tag eq 'content' ) {
        return err("<content> outside <entry>") unless $item;

        # if type is multipart/alternative, we continue recursing
        # (without swallowing); otherwise we accumulate
        my $type = $_{'type'} || "text/plain";
        return if $type eq "multipart/alternative";

        push @{ $item->{'contents'} }, [ $type, "" ];
        startaccum( \$item->{'contents'}->[-1]->[1] );
        return;
    }

    # we want to store the value of the nested <name> element
    # in the author slot, not accumulate the raw value -
    # use temp key "inauth" to detect the nesting
    if ( $tag eq 'author' ) {
        $holder->{inauth} = 1;
        return;
    }

    if ( $tag eq 'name' ) {
        if ( $holder->{inauth} ) {
            startaccum('author');
        }
        else {
            swallow();
        }
        return;
    }

    if ( $tag eq 'poster' ) {
        $holder->{ljposter} = $_{user};
        return err("No user attribute in <$tag>") unless $holder->{ljposter};
        return;
    }

    # store tags which should require no further
    # processing as they are, and others under _atom_*, to be processed
    # in EndTag when the entry or feed is closed

    if ( $tag eq 'title' ) {

        # entry's subject, or feed's title
        startaccum( $item ? "subject" : $tag );
        return;
    }

    if ( $tag eq 'atom:id' || $tag eq 'id' ) {
        startaccum($tag);
        return;
    }

    if ( $tag eq 'tagline' && !$item ) {

        # feed's tagline, our "description"
        startaccum("description");
        return;
    }

    # accumulate and store
    startaccum( "_atom_" . $tag );
    return;
}

# Handle a closing tag.
# $_ carries the unparsed tag.
sub EndTag {
    my ( $p, $tag ) = @_;

    # do nothing if there has been an error
    return if $error;

    # are we accumulating data?
    if ( defined $data ) {
        $ddepth--;
        if ( $ddepth == 0 ) {

            # stop accumulating
            $$dholder = $data if $dholder;
            undef $data;
            return;
        }
        $data .= $_;
        return;
    }

    if ( $tag eq 'entry' ) {
        _finish_entry();
    }
    elsif ( $tag eq 'author' ) {
        my $holder = $item || $feed;
        delete $holder->{inauth};
    }
    elsif ( $tag eq 'feed' ) {
        _finish_feed();
    }

    return;
}

# Remove the temporary _atom_* keys from a feed or item hash.
sub _drop_atom_keys {
    my ($hash) = @_;
    delete @{$hash}{ grep { substr( $_, 0, 6 ) eq '_atom_' } keys %$hash };
    return;
}

# Choose the [ type, text ] pair to display from an arrayref of them:
# application/xhtml+xml beats everything; otherwise the first html-ish
# type; otherwise text/plain; otherwise the first section.
sub _pick_content {
    my ($contents) = @_;

    # only one <content> section
    return $contents->[0] if @$contents == 1;

    # several <content> sections, must choose the best one
    my $chosen;
    foreach my $candidate (@$contents) {
        my $ctype = $candidate->[0];

        if ( $ctype eq "application/xhtml+xml" ) {

            # best match, don't bother to look at others
            $chosen = $candidate;
            last;
        }

        # only some kind of html/xhtml/html+xml, or plain text, is of interest
        next unless $ctype =~ m!html! || $ctype eq "text/plain";

        # choose this unless we've already chosen some html
        my $have_html = $chosen && $chosen->[0] =~ m!html!;
        $chosen = $candidate unless $have_html;
    }

    # if we didn't choose anything, pick the first one
    return $chosen || $contents->[0];
}

# Finalize the current $item when its <entry> closes: settle on its text,
# time and author, drop temporary keys and append it to @items.
# Entries with neither <content> nor <summary> are discarded.
sub _finish_entry {
    my $contents = $item->{'contents'} || [];

    unless (@$contents) {

        # this item had no <content>
        # maybe it has <summary>? if so, use <summary>
        # TODO: type= or encoding issues here? perhaps unite
        # handling of <summary> with that of <content>?
        unless ( $item->{'_atom_summary'} ) {

            # nothing to display, so ignore this entry
            undef $item;
            return;
        }
        $item->{'text'} = $item->{'_atom_summary'};
    }

    unless ( $item->{'text'} ) {    # unless we already have text

        # we ignore the 'mode' attribute of <content>. If it's "xml", we've
        # stringified it by accumulation; if it's "escaped", our parser
        # unescaped it
        # TODO: handle mode=base64?
        $item->{'text'} = _pick_content($contents)->[1];
    }
    delete $item->{'contents'};

    # generate time
    my $w3time =
           $item->{'_atom_created'}
        || $item->{'_atom_published'}
        || $item->{'_atom_modified'}
        || $item->{'_atom_updated'};

    # see http://www.w3.org/TR/NOTE-datetime for format
    # we insist on having granularity up to a minute,
    # and ignore finer data as well as the timezone, for now
    if ( $w3time && $w3time =~ m!^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d)! ) {
        $item->{time} = "$1-$2-$3 $4:$5";
    }

    # if we found ljposter, use that as preferred author
    my $ljposter = delete $item->{ljposter};
    $item->{author} = $ljposter if defined $ljposter;

    # get rid of all other tags we don't need anymore
    _drop_atom_keys($item);

    push @items, $item;
    undef $item;
    return;
}

# Finalize $feed when <feed> closes.
sub _finish_feed {

    # if feed author exists, all items should default to it
    if ( defined $feed->{author} ) {
        $_->{author} ||= $feed->{author} foreach @items;
    }

    # get rid of all other tags we don't need anymore
    _drop_atom_keys($feed);

    # link the feed with its items
    $feed->{'items'} = \@items;
    return;
}

# Handle character data: it is only kept while accumulating.
sub Text {
    my $p = shift;

    # do nothing if there has been an error
    return if $error;

    $data .= $_ if defined $data;
}

sub PI {

    # ignore processing instructions
    return;
}

sub EndDocument {

    # if we parsed a feed, link items to it
    $feed->{'items'} = \@items if $feed;
    return;
}

1;