2074 lines
77 KiB
Perl
2074 lines
77 KiB
Perl
|
|
# This code was forked from the LiveJournal project owned and operated
|
||
|
|
# by Live Journal, Inc. The code has been modified and expanded by
|
||
|
|
# Dreamwidth Studios, LLC. These files were originally licensed under
|
||
|
|
# the terms of the license supplied by Live Journal, Inc, which can
|
||
|
|
# currently be found at:
|
||
|
|
#
|
||
|
|
# http://code.livejournal.org/trac/livejournal/browser/trunk/LICENSE-LiveJournal.txt
|
||
|
|
#
|
||
|
|
# In accordance with the original license, this code and all its
|
||
|
|
# modifications are provided under the GNU General Public License.
|
||
|
|
# A copy of that license can be found in the LICENSE file included as
|
||
|
|
# part of this distribution.
|
||
|
|
|
||
|
|
package LJ::CleanHTML;
|
||
|
|
use strict;
|
||
|
|
|
||
|
|
use URI;
|
||
|
|
use HTMLCleaner;
|
||
|
|
use LJ::CSS::Cleaner;
|
||
|
|
use HTML::TokeParser;
|
||
|
|
use LJ::EmbedModule;
|
||
|
|
use LJ::Config;
|
||
|
|
use Text::Markdown;
|
||
|
|
use LJ::TextUtil;
|
||
|
|
use DW::Formats;
|
||
|
|
use DW::External::Site;
|
||
|
|
|
||
|
|
LJ::Config->load;
|
||
|
|
|
||
|
|
# attempt to mangle an email address for printing out to HTML. this is
|
||
|
|
# kind of futile, but we try anyway.
|
||
|
|
sub mangle_email_address {
|
||
|
|
my $email = $_[0];
|
||
|
|
$email =~ s!^(.+)@(.+)$!<span>$1</span><span><em>@</em></span>$2!;
|
||
|
|
return $email;
|
||
|
|
}
|
||
|
|
|
||
|
|
# LJ::CleanHTML::clean(\$u->{'bio'}, {
|
||
|
|
# 'addbreaks' => 1, # insert <br/> after newlines where appropriate
|
||
|
|
# 'eat' => [qw(head title style layer iframe)],
|
||
|
|
# 'mode' => 'allow',
|
||
|
|
# 'deny' => [qw(marquee)],
|
||
|
|
# 'remove' => [qw()],
|
||
|
|
# 'maximgwidth' => 100,
|
||
|
|
# 'maximgheight' => 100,
|
||
|
|
# 'keepcomments' => 1,
|
||
|
|
# 'cuturl' => 'http://www.domain.com/full_item_view.ext',
|
||
|
|
# 'ljcut_disable' => 1, # stops the cleaner from using the lj-cut tag
|
||
|
|
# 'cleancss' => 1,
|
||
|
|
# 'extractlinks' => 1, # remove a hrefs; implies noautolinks
|
||
|
|
# 'noautolinks' => 1, # do not auto linkify
|
||
|
|
# 'extractimages' => 1, # placeholder images
|
||
|
|
# 'transform_embed_nocheck' => 1, # do not do checks on object/embed tag transforming
|
||
|
|
# 'transform_embed_wmode' => <value>, # define a wmode value for videos (usually 'transparent' is the value you want)
|
||
|
|
# 'blocked_links' => [ qr/evil\.com/, qw/spammer\.com/ ], # list of sites which URL's will be blocked
|
||
|
|
# 'blocked_link_substitute' => 'http://domain.com/error.html' # blocked links will be replaced by this URL
|
||
|
|
# 'to_external_site' => 0, # flag for when the content is going to be fed to external sites, so it can be special-cased. e.g., feeds
|
||
|
|
# });
|
||
|
|
|
||
|
|
sub helper_preload {
|
||
|
|
my $p = HTML::TokeParser->new("");
|
||
|
|
eval { $p->DESTROY(); };
|
||
|
|
}
|
||
|
|
|
||
|
|
# this treats normal characters and &entities; as single characters
|
||
|
|
# also treats UTF-8 chars as single characters
|
||
|
|
my $onechar;
|
||
|
|
{
|
||
|
|
my $utf_longchar =
|
||
|
|
'[\xc2-\xdf][\x80-\xbf]|\xe0[\xa0-\xbf][\x80-\xbf]|[\xe1-\xef][\x80-\xbf][\x80-\xbf]|\xf0[\x90-\xbf][\x80-\xbf][\x80-\xbf]|[\xf1-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]';
|
||
|
|
my $match = $utf_longchar . '|[^&\s\x80-\xff]|(?:&\#?\w{1,7};)';
|
||
|
|
$onechar = qr/$match/o;
|
||
|
|
}
|
||
|
|
|
||
|
|
# In XHTML you can close a tag in the same opening tag like <br />,
|
||
|
|
# but some browsers still will interpret it as an opening only tag.
|
||
|
|
# This is a list of tags which you can actually close with a trailing
|
||
|
|
# slash and get the proper behavior from a browser.
|
||
|
|
#
|
||
|
|
# In HTML5 these are called "void elements".
|
||
|
|
my $slashclose_tags =
|
||
|
|
qr/^(?:area|base|basefont|br|col|embed|frame|hr|img|input|isindex|link|meta|param|source|track|wbr|lj-embed|site-embed|poll-\d+|lj-poll-\d+)$/i;
|
||
|
|
|
||
|
|
# <LJFUNC>
|
||
|
|
# name: LJ::CleanHTML::clean
|
||
|
|
# class: text
|
||
|
|
# des: Multi-faceted HTML parse function
|
||
|
|
# info:
|
||
|
|
# args: data, opts
|
||
|
|
# des-data: A reference to HTML to parse to output, or HTML if modified in-place.
|
||
|
|
# des-opts: An hash of options to pass to the parser.
|
||
|
|
# returns: Nothing.
|
||
|
|
# </LJFUNC>
|
||
|
|
sub clean {
|
||
|
|
my $data = shift;
|
||
|
|
return undef unless defined $$data;
|
||
|
|
|
||
|
|
my $opts = shift;
|
||
|
|
|
||
|
|
# this has to be an empty string because otherwise we might never actually append
|
||
|
|
# anything to it if $$data contains only invalid content
|
||
|
|
my $newdata = '';
|
||
|
|
|
||
|
|
# Set up configuration and defaults:
|
||
|
|
my $addbreaks = $opts->{addbreaks}; # \n -> <br>
|
||
|
|
my $keepcomments = $opts->{keepcomments};
|
||
|
|
my $mode = $opts->{mode};
|
||
|
|
my $nodwtags = $opts->{nodwtags} || 0; # Disable all special DW/LJ tags
|
||
|
|
my $cut = $opts->{cuturl} || $opts->{cutpreview};
|
||
|
|
my $ljcut_disable = $opts->{ljcut_disable};
|
||
|
|
my $extractlinks = 0 || $opts->{extractlinks}; # Links become `<b>text</b> (url)`
|
||
|
|
my $noexpand_embedded = $opts->{noexpandembedded} || $opts->{textonly} || 0;
|
||
|
|
my $transform_embed_nocheck = $opts->{transform_embed_nocheck} || 0;
|
||
|
|
my $transform_embed_wmode = $opts->{transform_embed_wmode};
|
||
|
|
my $rewrite_embed_param = $opts->{rewrite_embed_param} || 0;
|
||
|
|
my $remove_colors = $opts->{remove_colors} || 0;
|
||
|
|
my $remove_sizes = $opts->{remove_sizes} || 0;
|
||
|
|
my $remove_abs_sizes = $opts->{remove_abs_sizes} || 0;
|
||
|
|
my $remove_fonts = $opts->{remove_fonts} || 0;
|
||
|
|
my $at_mentions = $opts->{at_mentions} || 0; # @person.place -> user tag
|
||
|
|
my $formatting = $opts->{formatting} // 'html'; # html, or do we need to convert?
|
||
|
|
my $auto_links = !( $extractlinks || $opts->{noautolinks} );
|
||
|
|
|
||
|
|
$auto_links = 0 if $formatting ne 'html';
|
||
|
|
$cut = 0 if $nodwtags;
|
||
|
|
$at_mentions = 0 if $nodwtags;
|
||
|
|
|
||
|
|
my $blocked_links =
|
||
|
|
( exists $opts->{'blocked_links'} ) ? $opts->{'blocked_links'} : \@LJ::BLOCKED_LINKS;
|
||
|
|
my $blocked_link_substitute =
|
||
|
|
( exists $opts->{'blocked_link_substitute'} ) ? $opts->{'blocked_link_substitute'}
|
||
|
|
: ($LJ::BLOCKED_LINK_SUBSTITUTE) ? $LJ::BLOCKED_LINK_SUBSTITUTE
|
||
|
|
: '#';
|
||
|
|
my $suspend_msg = $opts->{'suspend_msg'} || 0;
|
||
|
|
my $to_external_site = $opts->{to_external_site} || 0;
|
||
|
|
my $preserve_lj_tags_for = $opts->{preserve_lj_tags_for} || 0; # False or site name
|
||
|
|
my $remove_positioning = $opts->{'remove_positioning'} || 0;
|
||
|
|
my $errref = $opts->{errref};
|
||
|
|
my $verbose_err = $opts->{verbose_err}; # Verbose parse errors
|
||
|
|
my @unclosed_tags;
|
||
|
|
|
||
|
|
# for ajax cut tag parsing
|
||
|
|
my $cut_retrieve = $opts->{cut_retrieve} || 0;
|
||
|
|
my $journal = $opts->{journal} || "";
|
||
|
|
my $ditemid = $opts->{ditemid} || "";
|
||
|
|
|
||
|
|
my %action = ();
|
||
|
|
my %remove = ();
|
||
|
|
if ( ref $opts->{'allow'} eq "ARRAY" ) {
|
||
|
|
foreach ( @{ $opts->{'allow'} } ) { $action{$_} = "allow"; }
|
||
|
|
}
|
||
|
|
if ( ref $opts->{'eat'} eq "ARRAY" ) {
|
||
|
|
foreach ( @{ $opts->{'eat'} } ) { $action{$_} = "eat"; }
|
||
|
|
}
|
||
|
|
if ( ref $opts->{'deny'} eq "ARRAY" ) {
|
||
|
|
foreach ( @{ $opts->{'deny'} } ) { $action{$_} = "deny"; }
|
||
|
|
}
|
||
|
|
if ( ref $opts->{'remove'} eq "ARRAY" ) {
|
||
|
|
foreach ( @{ $opts->{'remove'} } ) { $action{$_} = "deny"; $remove{$_} = 1; }
|
||
|
|
}
|
||
|
|
if ( ref $opts->{'conditional'} eq "ARRAY" ) {
|
||
|
|
foreach ( @{ $opts->{'conditional'} } ) { $action{$_} = "conditional"; }
|
||
|
|
}
|
||
|
|
|
||
|
|
$action{'script'} = "eat";
|
||
|
|
|
||
|
|
# if removing sizes, remove heading tags
|
||
|
|
if ($remove_sizes) {
|
||
|
|
foreach my $tag (qw( h1 h2 h3 h4 h5 h6 )) {
|
||
|
|
$action{$tag} = "deny";
|
||
|
|
$remove{$tag} = 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $opts->{'strongcleancss'} ) {
|
||
|
|
$opts->{'cleancss'} = 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
my @attrstrip = qw();
|
||
|
|
|
||
|
|
# cleancss means clean annoying css
|
||
|
|
# clean_js_css means clean javascript from css
|
||
|
|
if ( $opts->{'cleancss'} ) {
|
||
|
|
push @attrstrip, 'id';
|
||
|
|
$opts->{'clean_js_css'} = 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $opts->{'nocss'} ) {
|
||
|
|
push @attrstrip, 'style';
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( ref $opts->{'attrstrip'} eq "ARRAY" ) {
|
||
|
|
foreach ( @{ $opts->{'attrstrip'} } ) { push @attrstrip, $_; }
|
||
|
|
}
|
||
|
|
|
||
|
|
# Do some preprocessing of the input text before we try to parse it as HTML:
|
||
|
|
# First, remove the auth portion of any see_request links
|
||
|
|
$$data = LJ::strip_request_auth($$data);
|
||
|
|
|
||
|
|
# Second, convert Markdown; from here on, we can process it as raw HTML (no autoformatting)
|
||
|
|
if ( $formatting eq 'markdown' ) {
|
||
|
|
$$data = Text::Markdown::markdown($$data);
|
||
|
|
$addbreaks = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
# Create the HTML parser we'll use to navigate the text from here on out:
|
||
|
|
my $p = HTML::TokeParser->new($data);
|
||
|
|
|
||
|
|
# Set up state variables:
|
||
|
|
my @canonical_urls; # extracted links
|
||
|
|
|
||
|
|
my %opencount = map { $_ => 0 } qw(td th);
|
||
|
|
my @tablescope = ();
|
||
|
|
|
||
|
|
my $cutcount = 0;
|
||
|
|
|
||
|
|
# bytes known good. set this BEFORE we start parsing any new
|
||
|
|
# start tag, where most evil is (because where attributes can be)
|
||
|
|
# then, if we have to totally fail, we can cut stuff off after this.
|
||
|
|
my $good_until = 0;
|
||
|
|
|
||
|
|
# then, if we decide that part of an entry has invalid content, we'll
|
||
|
|
# escape that part and stuff it in here. this lets us finish cleaning
|
||
|
|
# the "good" part of the entry (since some tags might not get closed
|
||
|
|
# till after $good_until bytes into the text).
|
||
|
|
my $extra_text;
|
||
|
|
my $total_fail = sub {
|
||
|
|
my ( $cuturl, $tag ) = @_;
|
||
|
|
$tag = LJ::ehtml($tag);
|
||
|
|
my $err_str;
|
||
|
|
|
||
|
|
my $edata = LJ::ehtml($$data);
|
||
|
|
$edata =~ s/\r?\n/<br \/>/g if $addbreaks;
|
||
|
|
|
||
|
|
if ($cuturl) {
|
||
|
|
my $cutlink = LJ::ehtml($cuturl);
|
||
|
|
$err_str = '.error.markup';
|
||
|
|
$extra_text =
|
||
|
|
"<strong>"
|
||
|
|
. LJ::Lang::ml( 'cleanhtml.error.markup', { aopts => "href='$cutlink'" } )
|
||
|
|
. "</strong>";
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
$err_str = { error => '.error.markup.extra', opts => { aopts => $tag } };
|
||
|
|
$extra_text =
|
||
|
|
LJ::Lang::ml( 'cleanhtml.error.markup.extra', { aopts => $tag } )
|
||
|
|
. "<br /><br />"
|
||
|
|
. '<div style="width: 95%; overflow: auto">'
|
||
|
|
. $edata
|
||
|
|
. '</div>';
|
||
|
|
}
|
||
|
|
|
||
|
|
$extra_text = "<div class='ljparseerror'>$extra_text</div>";
|
||
|
|
$$verbose_err = $err_str if $verbose_err;
|
||
|
|
$$errref = "parseerror" if $errref;
|
||
|
|
|
||
|
|
};
|
||
|
|
|
||
|
|
my $htmlcleaner = HTMLCleaner->new( valid_stylesheet => \&LJ::valid_stylesheet_url );
|
||
|
|
|
||
|
|
my $eating_ljuser_span = 0; # bool, if we're eating an ljuser span
|
||
|
|
my $ljuser_text_node = ""; # the last text node we saw while eating ljuser tags
|
||
|
|
my @eatuntil = (); # if non-empty, we're eating everything. thing at end is thing
|
||
|
|
# we're looking to open again or close again.
|
||
|
|
|
||
|
|
my $capturing_during_eat; # if we save all tokens that happen inside the eating.
|
||
|
|
my @capture = (); # if so, they go here
|
||
|
|
|
||
|
|
my @tagstack = (); # so we can make sure that tags are closed properly/in order
|
||
|
|
|
||
|
|
my $disable_user_conversion = 0;
|
||
|
|
|
||
|
|
my $form_tag = {
|
||
|
|
input => 1,
|
||
|
|
select => 1,
|
||
|
|
option => 1,
|
||
|
|
};
|
||
|
|
|
||
|
|
my $start_capture = sub {
|
||
|
|
next if $capturing_during_eat;
|
||
|
|
|
||
|
|
my ( $tag, $first_token, $cb ) = @_;
|
||
|
|
push @eatuntil, $tag;
|
||
|
|
@capture = ($first_token);
|
||
|
|
$capturing_during_eat = $cb || sub { };
|
||
|
|
};
|
||
|
|
|
||
|
|
my $finish_capture = sub {
|
||
|
|
@capture = ();
|
||
|
|
$capturing_during_eat = undef;
|
||
|
|
};
|
||
|
|
|
||
|
|
# we now allow users to use new tags that aren't "lj" tags. this short
|
||
|
|
# stub allows us to "upgrade" the tag.
|
||
|
|
my $tag_updates = {
|
||
|
|
'cut' => 'lj-cut',
|
||
|
|
'poll' => 'lj-poll',
|
||
|
|
'poll-item' => 'lj-pi',
|
||
|
|
'poll-question' => 'lj-pq',
|
||
|
|
'raw-code' => 'lj-raw',
|
||
|
|
'site-embed' => 'lj-embed',
|
||
|
|
'user' => 'lj',
|
||
|
|
};
|
||
|
|
my $update_tag = sub {
|
||
|
|
return $tag_updates->{ $_[0] } || $_[0];
|
||
|
|
};
|
||
|
|
|
||
|
|
my $usertag_opts = {
|
||
|
|
textonly => $opts->{textonly} ? 1 : 0,
|
||
|
|
preserve_lj_tags_for => $opts->{preserve_lj_tags_for} || 0,
|
||
|
|
no_ljuser_class => $opts->{to_external_site} ? 1 : 0,
|
||
|
|
no_link => 0,
|
||
|
|
};
|
||
|
|
|
||
|
|
# if we're retrieving a cut tag, then we want to eat everything
|
||
|
|
# until we hit the first cut tag.
|
||
|
|
my @cuttag_stack = ();
|
||
|
|
my $eatall = $cut_retrieve ? 1 : 0;
|
||
|
|
|
||
|
|
TOKEN:
|
||
|
|
while ( my $token = $p->get_token ) {
|
||
|
|
my $type = $token->[0];
|
||
|
|
|
||
|
|
$usertag_opts->{no_link} = $opencount{'a'} ? 1 : 0;
|
||
|
|
|
||
|
|
if ( $type eq "S" ) # start tag
|
||
|
|
{
|
||
|
|
my $tag = $update_tag->( $token->[1] );
|
||
|
|
my $attr = $token->[2]; # hashref
|
||
|
|
my $ljcut_div =
|
||
|
|
$tag eq "div" && defined lc $attr->{class} && lc $attr->{class} eq "ljcut";
|
||
|
|
|
||
|
|
$good_until = length $newdata;
|
||
|
|
|
||
|
|
if (@eatuntil) {
|
||
|
|
push @capture, $token if $capturing_during_eat;
|
||
|
|
|
||
|
|
# have to keep the cut counts consistent even if they're nested
|
||
|
|
if ( $tag eq "lj-cut" || $ljcut_div ) {
|
||
|
|
$cutcount++;
|
||
|
|
}
|
||
|
|
if ( $tag eq $eatuntil[-1] ) {
|
||
|
|
push @eatuntil, $tag;
|
||
|
|
}
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
# if we're looking for cut tags, ignore everything that's
|
||
|
|
# not a cut tag.
|
||
|
|
if ( $eatall && $tag ne "lj-cut" && !$ljcut_div ) {
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $tag eq "lj-template" && !$noexpand_embedded && !$nodwtags ) {
|
||
|
|
my $name = $attr->{name} || "";
|
||
|
|
$name =~ s/-/_/g;
|
||
|
|
|
||
|
|
my $run_template_hook = sub {
|
||
|
|
|
||
|
|
# deprecated - will always print an error msg (see #1869)
|
||
|
|
$newdata .=
|
||
|
|
"<strong>"
|
||
|
|
. LJ::Lang::ml( 'cleanhtml.error.template', { aopts => LJ::ehtml($name) } )
|
||
|
|
. "</strong>";
|
||
|
|
};
|
||
|
|
|
||
|
|
if ( $attr->{'/'} ) {
|
||
|
|
|
||
|
|
# template is self-closing, no need to do capture
|
||
|
|
$run_template_hook->( $token, 1 );
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
# capture and send content to hook
|
||
|
|
$start_capture->( "lj-template", $token, $run_template_hook );
|
||
|
|
}
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
# Capture object and embed tags to possibly transform them into something else.
|
||
|
|
if ( $tag eq "object" || $tag eq "embed" ) {
|
||
|
|
if ( LJ::Hooks::are_hooks("transform_embed") && !$noexpand_embedded ) {
|
||
|
|
|
||
|
|
# XHTML style open/close tags done as a singleton shouldn't actually
|
||
|
|
# start a capture loop, because there won't be a close tag.
|
||
|
|
if ( $attr->{'/'} ) {
|
||
|
|
$newdata .= LJ::Hooks::run_hook(
|
||
|
|
"transform_embed", [$token],
|
||
|
|
nocheck => $transform_embed_nocheck,
|
||
|
|
wmode => $transform_embed_wmode
|
||
|
|
) || "";
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
$start_capture->(
|
||
|
|
$tag, $token,
|
||
|
|
sub {
|
||
|
|
my $expanded = LJ::Hooks::run_hook(
|
||
|
|
"transform_embed", \@capture,
|
||
|
|
nocheck => $transform_embed_nocheck,
|
||
|
|
wmode => $transform_embed_wmode
|
||
|
|
);
|
||
|
|
$newdata .= $expanded || "";
|
||
|
|
}
|
||
|
|
);
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $tag eq "embed" && $rewrite_embed_param ) {
|
||
|
|
$attr->{allowscriptaccess} = "sameDomain"
|
||
|
|
if exists $attr->{allowscriptaccess} && $attr->{allowscriptaccess} ne 'never';
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $tag eq "param"
|
||
|
|
&& $rewrite_embed_param
|
||
|
|
&& $opencount{object}
|
||
|
|
&& lc( $attr->{name} ) eq 'allowscriptaccess' )
|
||
|
|
{
|
||
|
|
$attr->{value} = "sameDomain" if $attr->{value} ne 'never';
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $tag eq "span"
|
||
|
|
&& lc $attr->{class} eq "ljuser"
|
||
|
|
&& !$noexpand_embedded
|
||
|
|
&& !$nodwtags )
|
||
|
|
{
|
||
|
|
$eating_ljuser_span = 1;
|
||
|
|
$ljuser_text_node = "";
|
||
|
|
}
|
||
|
|
|
||
|
|
if ($eating_ljuser_span) {
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
# deprecated - will always print an error msg (see #1869)
|
||
|
|
if ( ( $tag eq "div" || $tag eq "span" )
|
||
|
|
&& defined $attr->{class}
|
||
|
|
&& lc $attr->{class} eq "ljvideo" )
|
||
|
|
{
|
||
|
|
$start_capture->(
|
||
|
|
$tag, $token,
|
||
|
|
sub {
|
||
|
|
$newdata .=
|
||
|
|
"<strong>"
|
||
|
|
. LJ::Lang::ml('cleanhtml.error.template.video')
|
||
|
|
. "</strong>";
|
||
|
|
}
|
||
|
|
);
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
# do some quick checking to see if this is an email address/URL, and if so, just
|
||
|
|
# escape it and ignore it
|
||
|
|
if ( $tag =~ m!(?:\@|://)! ) {
|
||
|
|
$newdata .= LJ::ehtml("<$tag>");
|
||
|
|
next;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $form_tag->{$tag} ) {
|
||
|
|
if ( !$opencount{form} ) {
|
||
|
|
$newdata .= "<$tag ... >";
|
||
|
|
next;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $tag eq "input" ) {
|
||
|
|
if ( $attr->{type} !~ /^\w+$/ || lc $attr->{type} eq "password" ) {
|
||
|
|
delete $attr->{type};
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
my $slashclose = 0; # If set to 1, use XML-style empty tag marker
|
||
|
|
# for tags like <name/>, pretend it's <name> and reinsert the slash later
|
||
|
|
$slashclose = 1 if ( $tag =~ s!/$!! );
|
||
|
|
|
||
|
|
unless ( $tag =~ /^\w([\w\-:_]*\w)?$/ ) {
|
||
|
|
$total_fail->( $cut, $tag );
|
||
|
|
last TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
# for incorrect tags like <name/attrib=val> (note the lack of a space)
|
||
|
|
# delete everything after 'name' to prevent a security loophole which happens
|
||
|
|
# because IE understands them.
|
||
|
|
$tag =~ s!/.+$!!;
|
||
|
|
|
||
|
|
if ( defined $action{$tag} and $action{$tag} eq "eat" ) {
|
||
|
|
$p->unget_token($token);
|
||
|
|
$p->get_tag("/$tag");
|
||
|
|
next;
|
||
|
|
}
|
||
|
|
|
||
|
|
# force this specific instance of the tag to be allowed (for conditional)
|
||
|
|
my $force_allow = 0;
|
||
|
|
|
||
|
|
if ( defined $action{$tag} and $action{$tag} eq "conditional" ) {
|
||
|
|
if ( $tag eq "iframe" ) {
|
||
|
|
my $can_https;
|
||
|
|
( $force_allow, $can_https ) =
|
||
|
|
LJ::Hooks::run_hook( 'allow_iframe_embeds', $attr->{src} );
|
||
|
|
$attr->{src} =~ s!^https?:!!
|
||
|
|
if $opts->{force_https_embed}
|
||
|
|
&& $can_https; # convert to protocol-relative URL
|
||
|
|
unless ($force_allow) {
|
||
|
|
## eat this tag
|
||
|
|
if ( !$attr->{'/'} ) {
|
||
|
|
## if not autoclosed tag (<iframe />),
|
||
|
|
## then skip everything till the closing tag
|
||
|
|
$p->get_tag("/iframe");
|
||
|
|
}
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
# remove the name, because it can be targetted by links
|
||
|
|
delete $attr->{name};
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
# try to call HTMLCleaner's element-specific cleaner on this open tag
|
||
|
|
my $clean_res = eval {
|
||
|
|
my $cleantag = $tag;
|
||
|
|
$cleantag =~ s/^.*://s;
|
||
|
|
$cleantag =~ s/[^\w]//g;
|
||
|
|
no strict 'subs';
|
||
|
|
my $meth = "CLEAN_$cleantag";
|
||
|
|
my $seq = $token->[3]; # attribute names, listref
|
||
|
|
my $code = $htmlcleaner->can($meth)
|
||
|
|
or return 1;
|
||
|
|
return $code->( $htmlcleaner, $seq, $attr );
|
||
|
|
};
|
||
|
|
next if !$@ && !$clean_res;
|
||
|
|
|
||
|
|
# this is so the rte converts its source to the standard ljuser html
|
||
|
|
my $ljuser_div =
|
||
|
|
defined $tag
|
||
|
|
&& $tag eq "div"
|
||
|
|
&& defined $attr->{class}
|
||
|
|
&& $attr->{class} eq "ljuser";
|
||
|
|
if ( $ljuser_div && !$nodwtags ) {
|
||
|
|
my $ljuser_text = $p->get_text("/b");
|
||
|
|
$p->get_tag("/div");
|
||
|
|
$ljuser_text =~ s/\[ # opening bracket
|
||
|
|
[^]]+ # everything up to the closing bracket...
|
||
|
|
\] # closing bracket
|
||
|
|
//x;
|
||
|
|
$tag = "lj";
|
||
|
|
$attr->{'user'} = $ljuser_text;
|
||
|
|
}
|
||
|
|
|
||
|
|
# stupid hack to remove the class='ljcut' from divs when we're
|
||
|
|
# disabling them, so we account for the open div normally later.
|
||
|
|
if ( $ljcut_div && $ljcut_disable ) {
|
||
|
|
$ljcut_div = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
# Hack: Twitter uses @-syntax to refer to users, so we want to set
|
||
|
|
# a flag that says we're in a space where we shouldn't embed.
|
||
|
|
if ( $tag eq 'blockquote' && $attr->{class} eq 'twitter-tweet' ) {
|
||
|
|
$disable_user_conversion = 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( ( $tag eq "lj-cut" || $ljcut_div ) && !$nodwtags ) {
|
||
|
|
|
||
|
|
# Here's the things can happen with cut tags:
|
||
|
|
# - User enabled a "don't cut" setting. Do nothing.
|
||
|
|
# - Recent/reading/etc. Cut it, link to $opts->{cuturl} + anchor,
|
||
|
|
# add markup for JS cut expander.
|
||
|
|
# - Entry page. Add numbered anchor and continue.
|
||
|
|
# - Moderation queue ($opts->{cutpreview}). Don't cut; add mock
|
||
|
|
# "read more"s and "</cut>"s.
|
||
|
|
# - RPC for cut expander. Keep ONLY the specified cut.
|
||
|
|
# - Crosspost. Keep literal "<lj-cut>"s. At end, close dangling cuts.
|
||
|
|
|
||
|
|
next TOKEN if $ljcut_disable;
|
||
|
|
$cutcount++;
|
||
|
|
|
||
|
|
# if this is the cut tag we're looking for, then push it
|
||
|
|
# onto the stack (in case there are nested cut tags) and
|
||
|
|
# start including the content.
|
||
|
|
if ($eatall) {
|
||
|
|
if ( $cutcount == $cut_retrieve ) {
|
||
|
|
$eatall = 0;
|
||
|
|
push @cuttag_stack, $tag;
|
||
|
|
}
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
my $link_text = sub {
|
||
|
|
my $text = "Read more...";
|
||
|
|
if ( $attr->{'text'} ) {
|
||
|
|
$text = $attr->{'text'};
|
||
|
|
if ( $text =~ /[^\x01-\x7f]/ ) {
|
||
|
|
$text = LJ::no_utf8_flag($text);
|
||
|
|
}
|
||
|
|
$text =~ s/</</g;
|
||
|
|
$text =~ s/>/>/g;
|
||
|
|
}
|
||
|
|
return $text;
|
||
|
|
};
|
||
|
|
if ( $opts->{preserve_lj_tags_for} ) {
|
||
|
|
$opencount{'lj-cut'}++;
|
||
|
|
$newdata .= qq{<lj-cut};
|
||
|
|
if ( $attr->{'text'} ) {
|
||
|
|
my $etext = $link_text->();
|
||
|
|
$newdata .= qq{ text="$etext"};
|
||
|
|
}
|
||
|
|
$newdata .= '>';
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
elsif ($cut) {
|
||
|
|
my $etext = $link_text->();
|
||
|
|
my $url = LJ::ehtml($cut);
|
||
|
|
$newdata .= "<div>" if $tag eq "div";
|
||
|
|
|
||
|
|
# include empty span and div to be filled in on page
|
||
|
|
# load if javascript is enabled
|
||
|
|
$newdata .=
|
||
|
|
"<span class=\"cut-wrapper\">"
|
||
|
|
. "<span style=\"display: none;\" id=\"span-cuttag_"
|
||
|
|
. $journal . "_"
|
||
|
|
. $ditemid . "_"
|
||
|
|
. $cutcount
|
||
|
|
. "\" class=\"cuttag\"></span>";
|
||
|
|
$newdata .=
|
||
|
|
"<b class=\"cut-open\">( </b><b class=\"cut-text\">"
|
||
|
|
. "<a href=\"$url#cutid$cutcount\">$etext</a>"
|
||
|
|
. "</b><b class=\"cut-close\"> )</b></span>";
|
||
|
|
$newdata .=
|
||
|
|
"<div style=\"display: none;\" id=\"div-cuttag_"
|
||
|
|
. $journal . "_"
|
||
|
|
. $ditemid . "_"
|
||
|
|
. $cutcount
|
||
|
|
. "\" aria-live=\"assertive\">";
|
||
|
|
$newdata .= "</div>";
|
||
|
|
$newdata .= "</div>" if $tag eq "div";
|
||
|
|
|
||
|
|
unless ( $opts->{'cutpreview'} ) {
|
||
|
|
push @eatuntil, $tag;
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
$newdata .= "<a name=\"cutid$cutcount\"></a>" unless $opts->{'textonly'};
|
||
|
|
if ( $tag eq "div" && !$opts->{'textonly'} ) {
|
||
|
|
$opencount{"div"}++;
|
||
|
|
my $etext = $link_text->();
|
||
|
|
$newdata .= "<div class=\"ljcut\" text=\"$etext\">";
|
||
|
|
}
|
||
|
|
next;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
elsif ( $tag eq "style" ) {
|
||
|
|
my $style = $p->get_text("/style");
|
||
|
|
$p->get_tag("/style");
|
||
|
|
if ( LJ::is_enabled('css_cleaner') ) {
|
||
|
|
my $cleaner = LJ::CSS::Cleaner->new;
|
||
|
|
$style = $cleaner->clean($style);
|
||
|
|
LJ::Hooks::run_hook( 'css_cleaner_transform', \$style );
|
||
|
|
if ($LJ::IS_DEV_SERVER) {
|
||
|
|
$style = "/* cleaned */\n" . $style;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
$newdata .= "\n<style>\n$style</style>\n";
|
||
|
|
next;
|
||
|
|
}
|
||
|
|
elsif ( $tag eq "lj" && !$nodwtags ) {
|
||
|
|
|
||
|
|
# keep <lj comm> working for backwards compatibility, but pretend
|
||
|
|
# it was <lj user> so we don't have to account for it below.
|
||
|
|
my $user = $attr->{user} =
|
||
|
|
exists $attr->{name} ? $attr->{name}
|
||
|
|
: exists $attr->{user} ? $attr->{user}
|
||
|
|
: exists $attr->{comm} ? $attr->{comm}
|
||
|
|
: undef;
|
||
|
|
|
||
|
|
$newdata .= user_link_html( $user, $attr->{site}, $usertag_opts );
|
||
|
|
}
|
||
|
|
elsif ( $tag eq "lj-raw" && !$nodwtags ) {
|
||
|
|
|
||
|
|
# Strip it out, but still register it as being open
|
||
|
|
$opencount{$tag}++;
|
||
|
|
}
|
||
|
|
|
||
|
|
# Don't allow any tag with the "set" attribute
|
||
|
|
elsif ( $tag =~ m/:set$/ ) {
|
||
|
|
next;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
my $alt_output = 0;
|
||
|
|
|
||
|
|
my $hash = $token->[2];
|
||
|
|
my $attrs = $token->[3]; # attribute names, in original order
|
||
|
|
|
||
|
|
$slashclose = 1 if delete $hash->{'/'};
|
||
|
|
|
||
|
|
foreach (@attrstrip) {
|
||
|
|
|
||
|
|
# maybe there's a better place for this?
|
||
|
|
next if ( lc $tag eq 'lj-embed' && lc $_ eq 'id' );
|
||
|
|
delete $hash->{$_};
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $tag eq "form" ) {
|
||
|
|
my $action = lc( $hash->{'action'} );
|
||
|
|
my $deny = 0;
|
||
|
|
if ( $action =~ m!^https?://?([^/]+)! ) {
|
||
|
|
my $host = $1;
|
||
|
|
$deny = 1
|
||
|
|
if $host =~ /[%\@\s]/
|
||
|
|
|| $LJ::FORM_DOMAIN_BANNED{$host};
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
$deny = 1;
|
||
|
|
}
|
||
|
|
delete $hash->{'action'} if $deny;
|
||
|
|
}
|
||
|
|
|
||
|
|
ATTR:
|
||
|
|
foreach my $attr ( keys %$hash ) {
|
||
|
|
if ( $attr =~ /^(?:on|dynsrc)/ ) {
|
||
|
|
delete $hash->{$attr};
|
||
|
|
next;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $attr eq "data" ) {
|
||
|
|
delete $hash->{$attr};
|
||
|
|
|
||
|
|
# type specifies the content type for the data specified by "data"
|
||
|
|
# without the data, this has no useful effect
|
||
|
|
# but may cause the object tag not to use the fallback values in Firefox
|
||
|
|
delete $hash->{"type"};
|
||
|
|
next;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $attr =~ /(?:^=)|[\x0b\x0d]/ ) {
|
||
|
|
|
||
|
|
# Cleaner attack: <p ='>' onmouseover="javascript:alert(document/**/.cookie)" >
|
||
|
|
# is returned by HTML::Parser as P_tag("='" => "='") Text( onmouseover...)
|
||
|
|
# which leads to reconstruction of valid HTML. Clever!
|
||
|
|
# detect this, and fail.
|
||
|
|
$total_fail->( $cut, "$tag $attr" );
|
||
|
|
last TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
# ignore attributes that do not fit this strict scheme
|
||
|
|
unless ( $attr =~ /^[\w_:-]+$/ ) {
|
||
|
|
$total_fail->(
|
||
|
|
$cut, "$tag " . ( scalar keys %$hash > 1 ? "[...] " : "" ) . "$attr"
|
||
|
|
);
|
||
|
|
last TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
$hash->{$attr} =~ s/[\t\n]//g;
|
||
|
|
|
||
|
|
# IE ignores the null character, so strip it out
|
||
|
|
$hash->{$attr} =~ s/\x0//g;
|
||
|
|
|
||
|
|
# IE sucks:
|
||
|
|
my $nowhite = $hash->{$attr};
|
||
|
|
$nowhite =~ s/[\s\x0b]+//g;
|
||
|
|
if ( $nowhite =~ /(?:jscript|livescript|javascript|vbscript|^about|data):/ix ) {
|
||
|
|
delete $hash->{$attr};
|
||
|
|
next;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $attr eq 'style' ) {
|
||
|
|
if ( $opts->{'cleancss'} ) {
|
||
|
|
|
||
|
|
# css2 spec, section 4.1.3
|
||
|
|
# position === p\osition :(
|
||
|
|
# strip all slashes no matter what.
|
||
|
|
$hash->{style} =~ s/\\//g;
|
||
|
|
|
||
|
|
# and catch the obvious ones ("[" is for things like document["coo"+"kie"]
|
||
|
|
foreach my $css ( "/*", "[",
|
||
|
|
qw(absolute fixed expression eval behavior cookie document window javascript -moz-binding)
|
||
|
|
)
|
||
|
|
{
|
||
|
|
if ( $hash->{style} =~ /\Q$css\E/i ) {
|
||
|
|
delete $hash->{style};
|
||
|
|
next ATTR;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $opts->{'strongcleancss'} ) {
|
||
|
|
if ( $hash->{style} =~
|
||
|
|
/-moz-|absolute|relative|outline|z-index|(?<!-)(?:top|left|right|bottom)\s*:|filter|-webkit-/io
|
||
|
|
)
|
||
|
|
{
|
||
|
|
delete $hash->{style};
|
||
|
|
next ATTR;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
# remove specific CSS definitions
|
||
|
|
if ($remove_colors) {
|
||
|
|
$hash->{style} =~ s/(?:background-)?color:.*?(?:;|$)//gi;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ($remove_sizes) {
|
||
|
|
$hash->{style} =~ s/font-size:.*?(?:;|$)//gi;
|
||
|
|
}
|
||
|
|
elsif ($remove_abs_sizes) {
|
||
|
|
$hash->{style} =~ s/font-size:\s*?\d+.*?(?:;|$)//gi;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ($remove_fonts) {
|
||
|
|
$hash->{style} =~ s/font-family:.*?(?:;|$)//gi;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ($remove_positioning) {
|
||
|
|
$hash->{style} =~ s/margin.*?(?:;|$)//gi;
|
||
|
|
$hash->{style} =~ s/height\s*?:.*?(?:;|$)//gi;
|
||
|
|
$hash->{style} =~ s/display\s*?:\s*?none\s*?(?:;|$)//gi;
|
||
|
|
|
||
|
|
my $too_large = 0;
|
||
|
|
PADDING:
|
||
|
|
while ( $hash->{style} =~ /padding.*?:\s*?(.*?)(?:;|$)/gi ) {
|
||
|
|
my $padding_value = $1;
|
||
|
|
|
||
|
|
foreach ( split /\s+/, $padding_value ) {
|
||
|
|
next unless $_;
|
||
|
|
if ( ( int($_) || 0 ) > 500 ) {
|
||
|
|
$too_large = 1;
|
||
|
|
last PADDING;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
$hash->{style} =~ s/padding.*?(?:;|$)//gi
|
||
|
|
if $too_large;
|
||
|
|
}
|
||
|
|
if ($extractlinks) {
|
||
|
|
$hash->{style} =~ s/url\(.*?\)//gi;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $opts->{'clean_js_css'} && LJ::is_enabled('css_cleaner') ) {
|
||
|
|
|
||
|
|
# and then run it through a harder CSS cleaner that does a full parse
|
||
|
|
my $css = LJ::CSS::Cleaner->new;
|
||
|
|
$hash->{style} = $css->clean_property( $hash->{style} );
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( ( $attr eq 'class' || $attr eq 'id' ) && $opts->{'strongcleancss'} ) {
|
||
|
|
delete $hash->{$attr};
|
||
|
|
next;
|
||
|
|
}
|
||
|
|
|
||
|
|
# reserve ljs_* ids for divs, etc so users can't override them to replace content
|
||
|
|
if ( $attr eq 'id' && $hash->{$attr} =~ /^ljs_/i ) {
|
||
|
|
delete $hash->{$attr};
|
||
|
|
next;
|
||
|
|
}
|
||
|
|
|
||
|
|
# remove specific attributes
|
||
|
|
my %remove_attrs = (
|
||
|
|
color => $remove_colors,
|
||
|
|
bgcolor => $remove_colors,
|
||
|
|
fgcolor => $remove_colors,
|
||
|
|
text => $remove_colors,
|
||
|
|
size => $remove_sizes,
|
||
|
|
face => $remove_fonts,
|
||
|
|
);
|
||
|
|
|
||
|
|
if ( $remove_attrs{$attr} ) {
|
||
|
|
delete $hash->{$attr};
|
||
|
|
next ATTR;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if ( exists $hash->{href} ) {
|
||
|
|
## links to some resources will be completely blocked
|
||
|
|
## and replaced by value of 'blocked_link_substitute' param
|
||
|
|
if ($blocked_links) {
|
||
|
|
foreach my $re (@$blocked_links) {
|
||
|
|
if ( $hash->{href} =~ $re ) {
|
||
|
|
$hash->{href} =
|
||
|
|
sprintf( $blocked_link_substitute, LJ::eurl( $hash->{href} ) );
|
||
|
|
last;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
unless ( $hash->{href} =~ s/^(?:lj|site):(?:\/\/)?(.*)$/ExpandLJURL($1)/ei ) {
|
||
|
|
$hash->{href} = canonical_url( $hash->{href}, 1 );
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $tag eq "img" ) {
|
||
|
|
my $img_bad = 0;
|
||
|
|
|
||
|
|
if ( defined $opts->{'maximgwidth'}
|
||
|
|
&& $hash->{width} > $opts->{maximgwidth} )
|
||
|
|
{
|
||
|
|
$img_bad = 1;
|
||
|
|
}
|
||
|
|
if ( defined $opts->{'maximgheight'}
|
||
|
|
&& $hash->{height} > $opts->{maximgheight} )
|
||
|
|
{
|
||
|
|
$img_bad = 1;
|
||
|
|
}
|
||
|
|
if ( !defined $hash->{width}
|
||
|
|
|| !defined $hash->{height} )
|
||
|
|
{
|
||
|
|
$img_bad ||= $opts->{imageplaceundef};
|
||
|
|
}
|
||
|
|
if ( $opts->{'extractimages'} ) { $img_bad = 1; }
|
||
|
|
|
||
|
|
my $sanitize_url = sub {
|
||
|
|
my $url = canonical_url( $_[0], 1 );
|
||
|
|
return $url if $to_external_site;
|
||
|
|
return https_url( $url, journal => $journal, ditemid => $ditemid );
|
||
|
|
};
|
||
|
|
|
||
|
|
$hash->{src} = $sanitize_url->( $hash->{src} );
|
||
|
|
|
||
|
|
# some responsive images use srcset as well as src;
|
||
|
|
# both attributes should be proxied for https if requested
|
||
|
|
if ( defined $hash->{srcset} ) {
|
||
|
|
$hash->{srcset} =~ s!\b(http://\S+)!$sanitize_url->( $1 )!egi;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ($img_bad) {
|
||
|
|
$newdata .=
|
||
|
|
"<a class=\"ljimgplaceholder\" href=\""
|
||
|
|
. LJ::ehtml( $hash->{'src'} ) . "\">"
|
||
|
|
. LJ::img('placeholder') . '</a>';
|
||
|
|
$alt_output = 1;
|
||
|
|
$opencount{"img"}++;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $tag eq "a" && $extractlinks ) {
|
||
|
|
push @canonical_urls, canonical_url( $token->[2]->{href}, 1 );
|
||
|
|
$newdata .= "<b>";
|
||
|
|
next;
|
||
|
|
}
|
||
|
|
|
||
|
|
# Through the xsl namespace in XML, it is possible to embed scripting lanaguages
|
||
|
|
# as elements which will then be executed by the browser. Combining this with
|
||
|
|
# customview.cgi makes it very easy for someone to replace their entire journal
|
||
|
|
# in S1 with a page that embeds scripting as well. An example being an AJAX
|
||
|
|
# six degrees tool, while cool it should not be allowed.
|
||
|
|
#
|
||
|
|
# FIXME Dreamwidth does not support S1 and customview has been removed.
|
||
|
|
#
|
||
|
|
# Example syntax:
|
||
|
|
# <xsl:element name="script">
|
||
|
|
# <xsl:attribute name="type">text/javascript</xsl:attribute>
|
||
|
|
if ( $tag eq 'xsl:attribute' ) {
|
||
|
|
$alt_output = 1; # We'll always deal with output for this token
|
||
|
|
|
||
|
|
my $orig_value = $p->get_text; # Get the value of this element
|
||
|
|
my $value = $orig_value; # Make a copy if this turns out to be alright
|
||
|
|
$value =~ s/\s+//g; # Remove any whitespace
|
||
|
|
|
||
|
|
# See if they are trying to output scripting, if so eat the xsl:attribute
|
||
|
|
# container and its value
|
||
|
|
if ( $value =~ /(javascript|vbscript)/i ) {
|
||
|
|
|
||
|
|
# Remove the closing tag from the tree
|
||
|
|
$p->get_token;
|
||
|
|
|
||
|
|
# Remove the value itself from the tree
|
||
|
|
$p->get_text;
|
||
|
|
|
||
|
|
# No harm, no foul...Write back out the original
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
$newdata .= "$token->[4]$orig_value";
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
unless ($alt_output) {
|
||
|
|
my $allow;
|
||
|
|
if ( $mode eq "allow" ) {
|
||
|
|
$allow = 1;
|
||
|
|
if ( defined $action{$tag} and $action{$tag} eq "deny" ) { $allow = 0; }
|
||
|
|
if ( defined $action{$tag} and $action{$tag} eq "conditional" ) {
|
||
|
|
$allow = $force_allow;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
$allow = 0;
|
||
|
|
if ( defined $action{$tag} and $action{$tag} eq "allow" ) { $allow = 1; }
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $allow && !$remove{$tag} ) {
|
||
|
|
$allow = 0 if
|
||
|
|
|
||
|
|
# can't open table elements from outside a table
|
||
|
|
( $tag =~ /^(?:tbody|thead|tfoot|tr|td|th|caption|colgroup|col)$/
|
||
|
|
&& !@tablescope )
|
||
|
|
||
|
||
|
|
|
||
|
|
# can't open td or th if not inside tr
|
||
|
|
( $tag =~ /^(?:td|th)$/ && !$tablescope[-1]->{'tr'} ) ||
|
||
|
|
|
||
|
|
# can't open a table unless inside a td or th
|
||
|
|
( $tag eq 'table' && @tablescope && !grep { $tablescope[-1]->{$_} }
|
||
|
|
qw(td th) );
|
||
|
|
|
||
|
|
if ($allow) { $newdata .= "<$tag"; }
|
||
|
|
else { $newdata .= "<$tag"; }
|
||
|
|
|
||
|
|
# output attributes in original order, but only those
|
||
|
|
# that are allowed (by still being in %$hash after cleaning)
|
||
|
|
foreach (@$attrs) {
|
||
|
|
unless ( LJ::is_ascii( $hash->{$_} ) ) {
|
||
|
|
|
||
|
|
# FIXME: this isn't nice. make faster. make generic.
|
||
|
|
# HTML::Parser decodes entities for us (which is good)
|
||
|
|
# but in Perl 5.8 also includes the "poison" SvUTF8
|
||
|
|
# flag on the scalar it returns, thus poisoning the
|
||
|
|
# rest of the content this scalar is appended with.
|
||
|
|
# we need to remove that poison at this point. *sigh*
|
||
|
|
$hash->{$_} = LJ::no_utf8_flag( $hash->{$_} );
|
||
|
|
}
|
||
|
|
$newdata .= " $_=\"" . LJ::ehtml( $hash->{$_} ) . "\""
|
||
|
|
if exists $hash->{$_};
|
||
|
|
}
|
||
|
|
|
||
|
|
if ($slashclose) {
|
||
|
|
if ( $tag =~ $slashclose_tags ) {
|
||
|
|
|
||
|
|
# ignore the effects of slashclose unless we're dealing with a tag that can
|
||
|
|
# actually close itself. Otherwise, a tag like <em /> can pass through as valid
|
||
|
|
# even though some browsers just render it as an opening tag
|
||
|
|
|
||
|
|
$newdata .= " /";
|
||
|
|
$opencount{$tag}--;
|
||
|
|
$tablescope[-1]->{$tag}-- if @tablescope;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
# we didn't actually slash close, treat this as a normal opening tag
|
||
|
|
|
||
|
|
$slashclose = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if ($allow) {
|
||
|
|
$newdata .= ">";
|
||
|
|
$opencount{$tag}++;
|
||
|
|
|
||
|
|
# open table
|
||
|
|
if ( $tag eq 'table' ) {
|
||
|
|
push @tablescope, {};
|
||
|
|
|
||
|
|
# new tag within current table
|
||
|
|
}
|
||
|
|
elsif (@tablescope) {
|
||
|
|
$tablescope[-1]->{$tag}++;
|
||
|
|
}
|
||
|
|
|
||
|
|
# we have all this previous logic which makes us
|
||
|
|
# not automatically close tags inside tables
|
||
|
|
# so rather than mess with it, let's just ignore those
|
||
|
|
# and only deal with non-self-closing tags
|
||
|
|
# which are not in a table
|
||
|
|
# (but we still want to close <table>; that's not yet inside the table)
|
||
|
|
push @tagstack, $tag
|
||
|
|
if !$slashclose && ( $tag eq "table" || !@tablescope );
|
||
|
|
}
|
||
|
|
else { $newdata .= ">"; }
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
# end tag
|
||
|
|
elsif ( $type eq "E" ) {
|
||
|
|
my $tag = $update_tag->( $token->[1] );
|
||
|
|
next TOKEN if $tag =~ /[^\w\-:]/;
|
||
|
|
|
||
|
|
if (@eatuntil) {
|
||
|
|
push @capture, $token if $capturing_during_eat;
|
||
|
|
|
||
|
|
if ( $eatuntil[-1] eq $tag ) {
|
||
|
|
pop @eatuntil;
|
||
|
|
if ( my $cb = $capturing_during_eat ) {
|
||
|
|
$cb->();
|
||
|
|
$finish_capture->();
|
||
|
|
}
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
next TOKEN if @eatuntil;
|
||
|
|
}
|
||
|
|
|
||
|
|
# if we're just getting the contents of a cut tag, then pop the
|
||
|
|
# tag off the stack. if this is the last tag on the stack, then
|
||
|
|
# go back to eating the rest of the content.
|
||
|
|
if (@cuttag_stack) {
|
||
|
|
if ( $cuttag_stack[-1] eq $tag ) {
|
||
|
|
pop @cuttag_stack;
|
||
|
|
|
||
|
|
last TOKEN unless (@cuttag_stack);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if ($eatall) {
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ($eating_ljuser_span) {
|
||
|
|
if ( $tag eq "span" ) {
|
||
|
|
$eating_ljuser_span = 0;
|
||
|
|
$newdata .= user_link_html( $ljuser_text_node, undef, $usertag_opts );
|
||
|
|
}
|
||
|
|
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
# Hack: For Twitter, which uses blockquotes to embed tweets, re-enable
|
||
|
|
# user conversion once we've exited a blockquote.
|
||
|
|
if ( $disable_user_conversion && $tag eq 'blockquote' ) {
|
||
|
|
$disable_user_conversion = 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
my $allow;
|
||
|
|
if ( $tag eq "lj-raw" && !$nodwtags ) {
|
||
|
|
$opencount{$tag}--;
|
||
|
|
$tablescope[-1]->{$tag}-- if @tablescope;
|
||
|
|
}
|
||
|
|
elsif ( $tag eq "lj-cut" && !$nodwtags ) {
|
||
|
|
|
||
|
|
# Since this is an end-tag, we can't know if it's the closing
|
||
|
|
# div for a faked <div class="ljcut"> tag, which means that
|
||
|
|
# community moderators can't see <b></cut></b> at the end of one
|
||
|
|
# of those tags; if this was a problem, then the 'S' branch of
|
||
|
|
# this function would need to record the ljcut_div flag in a
|
||
|
|
# state variable which is stashed across tokens.
|
||
|
|
if ( $opts->{preserve_lj_tags_for} && $opencount{'lj-cut'} ) {
|
||
|
|
$opencount{'lj-cut'}--;
|
||
|
|
$newdata .= "</lj-cut>";
|
||
|
|
}
|
||
|
|
elsif ( $opts->{'cutpreview'} ) {
|
||
|
|
$newdata .= "<b></cut></b>";
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
if ( $mode eq "allow" ) {
|
||
|
|
$allow = 1;
|
||
|
|
if ( defined $action{$tag}
|
||
|
|
and ( $action{$tag} eq "deny" || $action{$tag} eq "conditional" ) )
|
||
|
|
{
|
||
|
|
$allow = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
$allow = 0;
|
||
|
|
if ( defined $action{$tag} and $action{$tag} eq "allow" ) { $allow = 1; }
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $extractlinks && $tag eq "a" ) {
|
||
|
|
if (@canonical_urls) {
|
||
|
|
my $url = LJ::ehtml( pop @canonical_urls );
|
||
|
|
$newdata .= "</b> ($url)";
|
||
|
|
next;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $allow && !$remove{$tag} ) {
|
||
|
|
$allow = 0 if
|
||
|
|
|
||
|
|
# can't close table elements from outside a table
|
||
|
|
( $tag =~ /^(?:table|tbody|thead|tfoot|tr|td|th|caption|colgroup|col)$/
|
||
|
|
&& !@tablescope )
|
||
|
|
||
|
||
|
|
|
||
|
|
# can't close td or th unless open tr
|
||
|
|
( $tag =~ /^(?:td|th)$/ && !$tablescope[-1]->{'tr'} );
|
||
|
|
|
||
|
|
if ( $allow && !( $opts->{'noearlyclose'} && !$opencount{$tag} ) ) {
|
||
|
|
|
||
|
|
unless (@tablescope) {
|
||
|
|
my $close;
|
||
|
|
while ( ( $close = pop @tagstack ) && $close ne $tag ) {
|
||
|
|
$opencount{$close}--;
|
||
|
|
next if $close =~ $slashclose_tags;
|
||
|
|
$newdata .= "</$close>";
|
||
|
|
push @unclosed_tags, "$close"
|
||
|
|
unless $close eq 'p' || $close eq 'li';
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
# open table
|
||
|
|
if ( $tag eq 'table' ) {
|
||
|
|
pop @tablescope;
|
||
|
|
pop @tagstack if $tagstack[-1] eq 'table';
|
||
|
|
|
||
|
|
# closing tag within current table
|
||
|
|
}
|
||
|
|
elsif (@tablescope) {
|
||
|
|
|
||
|
|
# If this tag was not opened inside this table, then
|
||
|
|
# do not close it! (This let's the auto-closer clean
|
||
|
|
# up later.)
|
||
|
|
next TOKEN unless $tablescope[-1]->{$tag};
|
||
|
|
$tablescope[-1]->{$tag}--;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( $opencount{$tag} ) {
|
||
|
|
$newdata .= "</$tag>";
|
||
|
|
$opencount{$tag}--;
|
||
|
|
|
||
|
|
}
|
||
|
|
}
|
||
|
|
elsif ( !$allow || $form_tag->{$tag} && !$opencount{form} ) {
|
||
|
|
|
||
|
|
# tag wasn't allowed, or we have an out of scope form tag? display it then
|
||
|
|
$newdata .= "</$tag>";
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
# This is a closing tag for something that isn't open. We ignore these
|
||
|
|
# and do nothing with them.
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if ( defined $action{$tag}
|
||
|
|
and $action{$tag} eq "conditional" && $tagstack[-1] eq $tag )
|
||
|
|
{
|
||
|
|
$newdata .= "</$tag>";
|
||
|
|
pop @tagstack;
|
||
|
|
$opencount{$tag}--;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
elsif ( $type eq "D" ) {
|
||
|
|
|
||
|
|
# remove everything past first closing tag
|
||
|
|
$token->[1] =~ s/>.+/>/s;
|
||
|
|
|
||
|
|
# kill any opening tag except the starting one
|
||
|
|
$token->[1] =~ s/.<//sg;
|
||
|
|
$newdata .= $token->[1];
|
||
|
|
}
|
||
|
|
elsif ( $type eq "T" ) {
|
||
|
|
my %url = ();
|
||
|
|
my $urlcount = 0;
|
||
|
|
|
||
|
|
if (@eatuntil) {
|
||
|
|
push @capture, $token if $capturing_during_eat;
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ($eatall) {
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
if ($eating_ljuser_span) {
|
||
|
|
$ljuser_text_node = $token->[1];
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
# auto_format means: the dialect is "html with auto linebreaks," AND
|
||
|
|
# we're not currently in a context that needs to remain raw.
|
||
|
|
my $auto_format =
|
||
|
|
$formatting eq 'html'
|
||
|
|
&& $addbreaks
|
||
|
|
&& ( ( $opencount{table} || 0 ) <= ( $opencount{td} + $opencount{th} ) )
|
||
|
|
&& !$opencount{'pre'}
|
||
|
|
&& !$opencount{'textarea'}
|
||
|
|
&& !$opencount{'lj-raw'};
|
||
|
|
|
||
|
|
# Stash any URLs that should be auto-linked, and insert temporary
|
||
|
|
# placeholders that can survive the next few escaping steps. We'll
|
||
|
|
# restore the URLs later as links.
|
||
|
|
if ( $auto_format && $auto_links && !$opencount{'a'} ) {
|
||
|
|
my $match = sub {
|
||
|
|
my $str = shift;
|
||
|
|
if ( $str =~ /^(.*?)(&(#39|quot|lt|gt)(;.*)?)$/ ) {
|
||
|
|
$url{ ++$urlcount } = $1;
|
||
|
|
return "&url$urlcount;$1&urlend;$2";
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
$url{ ++$urlcount } = $str;
|
||
|
|
return "&url$urlcount;$str&urlend;";
|
||
|
|
}
|
||
|
|
};
|
||
|
|
$token->[1] =~ s!(https?://[^\s\'\"\<\>]+[a-zA-Z0-9_/&=\-])! $match->( $1 ); !ge;
|
||
|
|
}
|
||
|
|
|
||
|
|
# escape tags in text tokens. shouldn't belong here!
|
||
|
|
# especially because the parser returns things it's
|
||
|
|
# confused about (broken, ill-formed HTML) as text.
|
||
|
|
$token->[1] =~ s/</</g;
|
||
|
|
$token->[1] =~ s/>/>/g;
|
||
|
|
|
||
|
|
# auto-format some stuff!
|
||
|
|
if ($auto_format) {
|
||
|
|
|
||
|
|
# Add linebreaks
|
||
|
|
$token->[1] =~ s/\r?\n/<br \/>/g;
|
||
|
|
if ( !$opencount{'a'} ) {
|
||
|
|
|
||
|
|
# Restore any auto-linked URLs as real HTML links
|
||
|
|
$token->[1] =~ s/&url(\d+);(.*?)&urlend;/<a href=\"$url{$1}\">$2<\/a>/g;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
# convert user mentions, if we're in an appropriate context
|
||
|
|
if ($at_mentions) {
|
||
|
|
|
||
|
|
# Don't mangle code spans, code blocks, things that act like
|
||
|
|
# code blocks, or things we KNOW have foreign @mentions in em.
|
||
|
|
if ( !$disable_user_conversion
|
||
|
|
&& !$opencount{'code'}
|
||
|
|
&& !$opencount{'pre'}
|
||
|
|
&& !$opencount{'textarea'}
|
||
|
|
&& !$opencount{'lj-raw'} )
|
||
|
|
{
|
||
|
|
convert_user_mentions( \$token->[1], $usertag_opts );
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
$newdata .= $token->[1];
|
||
|
|
}
|
||
|
|
elsif ( $type eq "C" ) {
|
||
|
|
|
||
|
|
# probably a malformed tag rather than a comment, so escape it
|
||
|
|
# -- ehtml things like "<3", "<--->", "<>", etc
|
||
|
|
# -- comments must start with <! to be eaten
|
||
|
|
if ( $token->[1] =~ /^<[^!]/ ) {
|
||
|
|
$newdata .= LJ::ehtml( $token->[1] );
|
||
|
|
|
||
|
|
# by default, ditch comments
|
||
|
|
}
|
||
|
|
elsif ($keepcomments) {
|
||
|
|
my $com = $token->[1];
|
||
|
|
$com =~ s/^<!--\s*//;
|
||
|
|
$com =~ s/\s*--!>$//;
|
||
|
|
$com =~ s/<!--//;
|
||
|
|
$com =~ s/-->//;
|
||
|
|
$newdata .= "<!-- $com -->";
|
||
|
|
}
|
||
|
|
}
|
||
|
|
elsif ( $type eq "PI" ) {
|
||
|
|
my $tok = $token->[1];
|
||
|
|
$tok =~ s/</</g;
|
||
|
|
$tok =~ s/>/>/g;
|
||
|
|
$newdata .= "<?$tok>";
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
$newdata .= "<!-- OTHER: " . $type . "-->\n";
|
||
|
|
}
|
||
|
|
} # end while
|
||
|
|
|
||
|
|
# finish up open links if we're extracting them
|
||
|
|
if ( $extractlinks && @canonical_urls ) {
|
||
|
|
foreach my $url (@canonical_urls) {
|
||
|
|
$newdata .= "</b> (" . LJ::ehtml($url) . ")";
|
||
|
|
$opencount{'a'}--;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
# if we have a textarea open, we *MUST* close it first
|
||
|
|
if ( $opencount{textarea} ) {
|
||
|
|
$newdata .= "</textarea>";
|
||
|
|
push @unclosed_tags, "textarea";
|
||
|
|
}
|
||
|
|
$opencount{textarea} = 0;
|
||
|
|
|
||
|
|
# close any tags that were opened and not closed
|
||
|
|
# don't close tags that don't need a closing tag -- otherwise,
|
||
|
|
# we output the closing tags in the wrong place (eg, a </td>
|
||
|
|
# after the <table> was closed) causing unnecessary problems
|
||
|
|
foreach my $tag ( reverse @tagstack ) {
|
||
|
|
next if $tag =~ $slashclose_tags;
|
||
|
|
if ( $opencount{$tag} ) {
|
||
|
|
$newdata .= "</$tag>";
|
||
|
|
$opencount{$tag}--;
|
||
|
|
push @unclosed_tags, $tag unless $tag eq 'p' || $tag eq 'li';
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
# If crossposting, explicitly close cuts to keep the crosspost footer visible.
|
||
|
|
if ( $preserve_lj_tags_for && $opencount{'lj-cut'} ) {
|
||
|
|
while ( $opencount{'lj-cut'} > 0 ) {
|
||
|
|
$newdata .= "</lj-cut>";
|
||
|
|
$opencount{'lj-cut'}--;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
# extra-paranoid check
|
||
|
|
1 while $newdata =~ s/<script\b//ig;
|
||
|
|
|
||
|
|
$$data = $newdata;
|
||
|
|
$$data .= $extra_text if $extra_text; # invalid markup error
|
||
|
|
|
||
|
|
if ($suspend_msg) {
|
||
|
|
my $msg =
|
||
|
|
qq{<div style="color: #000; font: 12px Verdana, Arial, Sans-Serif; background-color: #ffeeee; background-repeat: repeat-x; border: 1px solid #ff9999; padding: 8px; margin: 5px auto; width: auto; text-align: left; background-image: url('$LJ::IMGPREFIX/message-error.gif');">};
|
||
|
|
|
||
|
|
$msg .= LJ::Lang::ml('cleanhtml.suspend_msg');
|
||
|
|
$msg .= "</div>";
|
||
|
|
|
||
|
|
$$data = $msg . $$data;
|
||
|
|
}
|
||
|
|
|
||
|
|
# only add verbose errors for unclosed tags if we don't have another verbose error set
|
||
|
|
# otherwise, the tags error will overwrite more important errors like irreparable markup
|
||
|
|
if ( $verbose_err && ref($verbose_err) eq 'SCALAR' && scalar(@unclosed_tags) > 0 ) {
|
||
|
|
my $tag_str = "<" . join( ">, <", @unclosed_tags ) . ">";
|
||
|
|
$$verbose_err = { error => ".error.markup.unclosed", opts => { tags => $tag_str } };
|
||
|
|
}
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
# takes a reference to HTML and a base URL, and modifies HTML in place to use absolute URLs from the given base
|
||
|
|
sub resolve_relative_urls {
|
||
|
|
my ( $data, $base ) = @_;
|
||
|
|
my $p = HTML::TokeParser->new($data);
|
||
|
|
|
||
|
|
# where we look for relative URLs
|
||
|
|
my $rel_source = {
|
||
|
|
'a' => {
|
||
|
|
'href' => 1,
|
||
|
|
},
|
||
|
|
'img' => {
|
||
|
|
'src' => 1,
|
||
|
|
},
|
||
|
|
};
|
||
|
|
|
||
|
|
my $global_did_mod = 0;
|
||
|
|
my $base_uri = undef; # until needed
|
||
|
|
my $newdata = "";
|
||
|
|
|
||
|
|
TOKEN:
|
||
|
|
while ( my $token = $p->get_token ) {
|
||
|
|
my $type = $token->[0];
|
||
|
|
|
||
|
|
if ( $type eq "S" ) # start tag
|
||
|
|
{
|
||
|
|
my $tag = $token->[1];
|
||
|
|
my $hash = $token->[2]; # attribute hashref
|
||
|
|
my $attrs = $token->[3]; # attribute names, in original order
|
||
|
|
|
||
|
|
my $did_mod = 0;
|
||
|
|
|
||
|
|
# see if this is a tag that could contain relative URLs we fix up.
|
||
|
|
if ( my $relats = $rel_source->{$tag} ) {
|
||
|
|
while ( my $k = each %$relats ) {
|
||
|
|
next unless defined $hash->{$k} && $hash->{$k} !~ /^[a-z]+:/;
|
||
|
|
my $rel_url = $hash->{$k};
|
||
|
|
$global_did_mod = $did_mod = 1;
|
||
|
|
|
||
|
|
$base_uri ||= URI->new($base);
|
||
|
|
$hash->{$k} = URI->new_abs( $rel_url, $base_uri )->as_string;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
# if no change was necessary
|
||
|
|
unless ($did_mod) {
|
||
|
|
$newdata .= $token->[4];
|
||
|
|
next TOKEN;
|
||
|
|
}
|
||
|
|
|
||
|
|
# otherwise, rebuild the opening tag
|
||
|
|
|
||
|
|
# for tags like <name/>, pretend it's <name> and reinsert the slash later
|
||
|
|
my $slashclose = 0; # If set to 1, use XML-style empty tag marker
|
||
|
|
$slashclose = 1 if $tag =~ s!/$!!;
|
||
|
|
$slashclose = 1 if delete $hash->{'/'};
|
||
|
|
|
||
|
|
# spit it back out
|
||
|
|
$newdata .= "<$tag";
|
||
|
|
|
||
|
|
# output attributes in original order
|
||
|
|
foreach (@$attrs) {
|
||
|
|
$newdata .= " $_=\"" . LJ::ehtml( $hash->{$_} ) . "\""
|
||
|
|
if exists $hash->{$_};
|
||
|
|
}
|
||
|
|
$newdata .= " /" if $slashclose;
|
||
|
|
$newdata .= ">";
|
||
|
|
}
|
||
|
|
elsif ( $type eq "E" ) {
|
||
|
|
$newdata .= $token->[2];
|
||
|
|
}
|
||
|
|
elsif ( $type eq "D" ) {
|
||
|
|
$newdata .= $token->[1];
|
||
|
|
}
|
||
|
|
elsif ( $type eq "T" ) {
|
||
|
|
$newdata .= $token->[1];
|
||
|
|
}
|
||
|
|
elsif ( $type eq "C" ) {
|
||
|
|
$newdata .= $token->[1];
|
||
|
|
}
|
||
|
|
elsif ( $type eq "PI" ) {
|
||
|
|
$newdata .= $token->[2];
|
||
|
|
}
|
||
|
|
} # end while
|
||
|
|
|
||
|
|
$$data = $newdata if $global_did_mod;
|
||
|
|
return undef;
|
||
|
|
}
|
||
|
|
|
||
|
|
sub ExpandLJURL {
|
||
|
|
my @args = grep { $_ } split( /\//, $_[0] );
|
||
|
|
my $mode = shift @args;
|
||
|
|
|
||
|
|
my %modes = (
|
||
|
|
'faq' => sub {
|
||
|
|
my $id = shift() + 0;
|
||
|
|
if ($id) {
|
||
|
|
return "support/faqbrowse?faqid=$id";
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
return "support/faq";
|
||
|
|
}
|
||
|
|
},
|
||
|
|
'memories' => sub {
|
||
|
|
my $user = LJ::canonical_username(shift);
|
||
|
|
if ($user) {
|
||
|
|
return "memories?user=$user";
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
return "memories";
|
||
|
|
}
|
||
|
|
},
|
||
|
|
'support' => sub {
|
||
|
|
my $id = shift() + 0;
|
||
|
|
if ($id) {
|
||
|
|
return "support/see_request?id=$id";
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
return "support/";
|
||
|
|
}
|
||
|
|
},
|
||
|
|
'user' => sub {
|
||
|
|
my $user = LJ::canonical_username(shift);
|
||
|
|
return "" if grep { /[\"\'\<\>\n\&]/ } @_;
|
||
|
|
return $_[0] eq 'profile'
|
||
|
|
? "profile?user=$user"
|
||
|
|
: "users/$user/" . join( "", map { "$_/" } @_ );
|
||
|
|
},
|
||
|
|
'userinfo' => sub {
|
||
|
|
my $user = LJ::canonical_username(shift);
|
||
|
|
if ($user) {
|
||
|
|
return "profile?user=$user";
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
return "profile";
|
||
|
|
}
|
||
|
|
},
|
||
|
|
'userpics' => sub {
|
||
|
|
my $user = LJ::canonical_username(shift);
|
||
|
|
if ($user) {
|
||
|
|
return "allpics?user=$user";
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
return "allpics";
|
||
|
|
}
|
||
|
|
},
|
||
|
|
);
|
||
|
|
|
||
|
|
my $uri = $modes{$mode} ? $modes{$mode}->(@args) : "error:bogus-lj-url";
|
||
|
|
|
||
|
|
return "$LJ::SITEROOT/$uri";
|
||
|
|
}
|
||
|
|
|
||
|
|
my $subject_eat = [qw[ head title style layer iframe applet object xml param base ]];
|
||
|
|
my $subject_allow = [qw[a b i u em strong cite]];
|
||
|
|
my $subject_remove = [qw[bgsound embed object caption link font noscript]];
|
||
|
|
|
||
|
|
sub clean_subject {
|
||
|
|
my $ref = shift;
|
||
|
|
return unless defined $$ref and $$ref =~ /[\<\>]/;
|
||
|
|
clean(
|
||
|
|
$ref,
|
||
|
|
{
|
||
|
|
addbreaks => 0,
|
||
|
|
eat => $subject_eat,
|
||
|
|
mode => 'deny',
|
||
|
|
allow => $subject_allow,
|
||
|
|
remove => $subject_remove,
|
||
|
|
noearlyclose => 1,
|
||
|
|
|
||
|
|
# This is wrong in some cases, but clean_subject is used by many
|
||
|
|
# different paths that don't tell us where the content came from.
|
||
|
|
# Let's assume the most conservative for now.
|
||
|
|
formatting => 'html',
|
||
|
|
at_mentions => 0,
|
||
|
|
}
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
## returns a pure text subject (needed in links, email headers, etc...)
|
||
|
|
my $subjectall_eat = $subject_eat;
|
||
|
|
|
||
|
|
sub clean_subject_all {
|
||
|
|
my $ref = shift;
|
||
|
|
return unless $$ref =~ /[\<\>]/;
|
||
|
|
clean(
|
||
|
|
$ref,
|
||
|
|
{
|
||
|
|
addbreaks => 0,
|
||
|
|
eat => $subjectall_eat,
|
||
|
|
mode => 'deny',
|
||
|
|
textonly => 1,
|
||
|
|
noearlyclose => 1,
|
||
|
|
|
||
|
|
# This is wrong in some cases, but clean_subject is used by many
|
||
|
|
# different paths that don't tell us where the content came from.
|
||
|
|
# Let's assume the most conservative for now.
|
||
|
|
formatting => 'html',
|
||
|
|
at_mentions => 0,
|
||
|
|
}
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
# wrapper around clean_subject_all; this also trims the subject to the given length
|
||
|
|
sub clean_and_trim_subject {
|
||
|
|
my ( $ref, $length, $truncated ) = @_;
|
||
|
|
$length ||= 40;
|
||
|
|
|
||
|
|
LJ::CleanHTML::clean_subject_all($ref);
|
||
|
|
$$ref =~ s/\n.*//s;
|
||
|
|
$$ref = LJ::text_trim( $$ref, 0, $length, $truncated );
|
||
|
|
}
|
||
|
|
|
||
|
|
my @comment_eat = qw( head title style layer iframe applet object );
|
||
|
|
my @comment_anon_eat = (
|
||
|
|
@comment_eat, qw(
|
||
|
|
table tbody thead tfoot tr td th caption colgroup col font
|
||
|
|
)
|
||
|
|
);
|
||
|
|
|
||
|
|
my @comment_all = qw(
|
||
|
|
table tr td th tbody tfoot thead colgroup caption col
|
||
|
|
a sub sup xmp bdo q span
|
||
|
|
b i u tt s strike big small font
|
||
|
|
abbr acronym cite code dfn em kbd samp strong var del ins
|
||
|
|
h1 h2 h3 h4 h5 h6 div blockquote address pre center
|
||
|
|
ul ol li dl dt dd
|
||
|
|
area map form textarea
|
||
|
|
img br hr p col
|
||
|
|
summary details
|
||
|
|
);
|
||
|
|
|
||
|
|
my $event_eat = $subject_eat;
|
||
|
|
my $event_remove = [qw[ bgsound embed object link body meta noscript plaintext noframes ]];
|
||
|
|
|
||
|
|
my $userbio_eat = $event_eat;
|
||
|
|
my $userbio_remove = $event_remove;
|
||
|
|
|
||
|
|
# An "event" is the body text of a journal entry. But this also gets called for
|
||
|
|
# several unrelated things that LOOK a lot like journal entries, like FAQ items
|
||
|
|
# and support requests.
|
||
|
|
sub clean_event {
|
||
|
|
my ( $ref, $opts ) = @_;
|
||
|
|
return unless $$ref; # nothing to do
|
||
|
|
|
||
|
|
# old prototype was passing in the ref and preformatted flag.
|
||
|
|
# now the second argument is a hashref of options, so convert it to support the old way.
|
||
|
|
unless ( ref $opts eq "HASH" ) {
|
||
|
|
$opts = { 'preformatted' => $opts };
|
||
|
|
}
|
||
|
|
|
||
|
|
# Formatting is specified with the `editor` prop, if present...
|
||
|
|
my $formatting = $opts->{editor};
|
||
|
|
|
||
|
|
# ...and if not, here's how we guess.
|
||
|
|
if ( !$formatting ) {
|
||
|
|
if ( legacy_markdown($ref) ) {
|
||
|
|
|
||
|
|
# Old-style Markdown: remove the special !markdown prefix,
|
||
|
|
# and switch to Markdown formatting.
|
||
|
|
$formatting = 'markdown0';
|
||
|
|
}
|
||
|
|
elsif ( $opts->{is_syndicated} ) {
|
||
|
|
|
||
|
|
# MOST of the time, synsuck sets opt_preformatted, in which case
|
||
|
|
# this is plain old HTML that doesn't know anything about our
|
||
|
|
# special DW/LJ tags.
|
||
|
|
if ( $opts->{preformatted} ) {
|
||
|
|
$formatting = 'html_extra_raw';
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
# BUT, if a feed post contains zero HTML tags, synsuck doesn't
|
||
|
|
# set preformatted. Those posts can use a "wrong" format that at
|
||
|
|
# least handles line breaks legibly.
|
||
|
|
$formatting = 'html_casual0';
|
||
|
|
}
|
||
|
|
}
|
||
|
|
elsif ( $opts->{preformatted} ) {
|
||
|
|
$formatting = 'html_raw0';
|
||
|
|
}
|
||
|
|
elsif ( $opts->{is_imported} ) {
|
||
|
|
|
||
|
|
# LJ et al don't have @mentions
|
||
|
|
$formatting = 'html_casual0';
|
||
|
|
}
|
||
|
|
elsif ( $opts->{logtime_mysql} && $opts->{logtime_mysql} lt '2019-05' ) {
|
||
|
|
|
||
|
|
# Before @mentions were rolled out
|
||
|
|
$formatting = 'html_casual0';
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
$formatting = 'html_casual1';
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
my %formatting_args = formatting_args($formatting);
|
||
|
|
|
||
|
|
clean(
|
||
|
|
$ref,
|
||
|
|
{
|
||
|
|
%formatting_args,
|
||
|
|
|
||
|
|
cuturl => $opts->{cuturl},
|
||
|
|
cutpreview => $opts->{cutpreview},
|
||
|
|
eat => $event_eat,
|
||
|
|
mode => 'allow',
|
||
|
|
remove => $event_remove,
|
||
|
|
cleancss => 1,
|
||
|
|
maximgwidth => $opts->{maximgwidth},
|
||
|
|
maximgheight => $opts->{maximgheight},
|
||
|
|
imageplaceundef => $opts->{imageplaceundef},
|
||
|
|
ljcut_disable => $opts->{ljcut_disable},
|
||
|
|
noearlyclose => 1,
|
||
|
|
extractimages => $opts->{extractimages} ? 1 : 0,
|
||
|
|
noexpandembedded => $opts->{noexpandembedded} ? 1 : 0,
|
||
|
|
textonly => $opts->{textonly} ? 1 : 0,
|
||
|
|
remove_colors => $opts->{remove_colors} ? 1 : 0,
|
||
|
|
remove_sizes => $opts->{remove_sizes} ? 1 : 0,
|
||
|
|
remove_fonts => $opts->{remove_fonts} ? 1 : 0,
|
||
|
|
transform_embed_nocheck => $opts->{transform_embed_nocheck} ? 1 : 0,
|
||
|
|
transform_embed_wmode => $opts->{transform_embed_wmode},
|
||
|
|
rewrite_embed_param => $opts->{rewrite_embed_param} ? 1 : 0,
|
||
|
|
suspend_msg => $opts->{suspend_msg} ? 1 : 0,
|
||
|
|
to_external_site => $opts->{to_external_site} ? 1 : 0,
|
||
|
|
preserve_lj_tags_for => $opts->{preserve_lj_tags_for} || 0,
|
||
|
|
cut_retrieve => $opts->{cut_retrieve},
|
||
|
|
journal => $opts->{journal},
|
||
|
|
ditemid => $opts->{ditemid},
|
||
|
|
errref => $opts->{errref},
|
||
|
|
verbose_err => $opts->{verbose_err},
|
||
|
|
}
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
# clean JS out of embed module
|
||
|
|
sub clean_embed {
|
||
|
|
my ( $ref, $opts ) = @_;
|
||
|
|
return unless $$ref;
|
||
|
|
return unless LJ::is_enabled('embedmodule-cleancontent');
|
||
|
|
|
||
|
|
clean(
|
||
|
|
$ref,
|
||
|
|
{
|
||
|
|
addbreaks => 0,
|
||
|
|
mode => 'allow',
|
||
|
|
allow => [qw( object embed )],
|
||
|
|
deny => [qw( script )],
|
||
|
|
remove => [qw( script )],
|
||
|
|
conditional => [qw( iframe )],
|
||
|
|
ljcut_disable => 1,
|
||
|
|
cleancss => 1,
|
||
|
|
extractlinks => 0,
|
||
|
|
noautolinks => 1,
|
||
|
|
extractimages => 0,
|
||
|
|
noexpandembedded => 1,
|
||
|
|
transform_embed_nocheck => 1,
|
||
|
|
rewrite_embed_param => 1,
|
||
|
|
force_https_embed => $opts->{display_as_content},
|
||
|
|
|
||
|
|
# Embeds always come from somewhere else, so be conservative.
|
||
|
|
formatting => 'html',
|
||
|
|
at_mentions => 0,
|
||
|
|
}
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
sub get_okay_comment_tags {
|
||
|
|
return @comment_all;
|
||
|
|
}
|
||
|
|
|
||
|
|
# ref: scalarref of text to clean, gets cleaned in-place
|
||
|
|
# opts: either a hashref of opts:
|
||
|
|
# - preformatted: if true, don't insert breaks and auto-linkify
|
||
|
|
# - anon_comment: don't linkify things, and prevent <a> tags
|
||
|
|
# or, opts can just be a boolean scalar, which implies the performatted tag
|
||
|
|
sub clean_comment {
|
||
|
|
my ( $ref, $opts ) = @_;
|
||
|
|
return 0 unless defined $$ref;
|
||
|
|
|
||
|
|
$opts = { preformatted => $opts } unless ref $opts;
|
||
|
|
|
||
|
|
# Formatting is specified with the `editor` prop, if present...
|
||
|
|
my $formatting = $opts->{editor};
|
||
|
|
|
||
|
|
# ...and if not, here's how we guess.
|
||
|
|
# Differences from entries: No magic !markdown, no syndicated content.
|
||
|
|
if ( !$formatting ) {
|
||
|
|
if ( $opts->{preformatted} ) {
|
||
|
|
$formatting = 'html_raw0';
|
||
|
|
}
|
||
|
|
elsif ( $opts->{is_imported} ) {
|
||
|
|
|
||
|
|
# LJ et al don't have @mentions
|
||
|
|
$formatting = 'html_casual0';
|
||
|
|
}
|
||
|
|
elsif ( $opts->{datepost} && $opts->{datepost} lt '2019-05' ) {
|
||
|
|
|
||
|
|
# Before @mentions were rolled out
|
||
|
|
$formatting = 'html_casual0';
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
$formatting = 'html_casual1';
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
my %formatting_args = formatting_args($formatting);
|
||
|
|
|
||
|
|
return clean(
|
||
|
|
$ref,
|
||
|
|
{
|
||
|
|
%formatting_args,
|
||
|
|
|
||
|
|
eat => $opts->{anon_comment} ? \@comment_anon_eat : \@comment_eat,
|
||
|
|
mode => 'deny',
|
||
|
|
allow => \@comment_all,
|
||
|
|
cleancss => 1,
|
||
|
|
strongcleancss => 1,
|
||
|
|
extractlinks => $opts->{anon_comment},
|
||
|
|
extractimages => $opts->{anon_comment},
|
||
|
|
noearlyclose => 1,
|
||
|
|
nocss => $opts->{nocss},
|
||
|
|
textonly => $opts->{textonly} ? 1 : 0,
|
||
|
|
remove_positioning => 1,
|
||
|
|
remove_abs_sizes => $opts->{anon_comment},
|
||
|
|
}
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
sub clean_userbio {
|
||
|
|
my ( $ref, $strip_links ) = @_;
|
||
|
|
return undef unless ref $ref;
|
||
|
|
|
||
|
|
clean(
|
||
|
|
$ref,
|
||
|
|
{
|
||
|
|
addbreaks => 1,
|
||
|
|
attrstrip => [qw[style]],
|
||
|
|
mode => 'allow',
|
||
|
|
noearlyclose => 1,
|
||
|
|
eat => $userbio_eat,
|
||
|
|
remove => $userbio_remove,
|
||
|
|
cleancss => 1,
|
||
|
|
|
||
|
|
# Bios are always local, but for now, we are marking them as
|
||
|
|
# HTML so that people don't have to reformat everything.
|
||
|
|
formatting => 'html',
|
||
|
|
at_mentions => 1,
|
||
|
|
noautolinks => $strip_links,
|
||
|
|
extractlinks => $strip_links,
|
||
|
|
}
|
||
|
|
);
|
||
|
|
}
|
||
|
|
|
||
|
|
sub canonical_url {
|
||
|
|
my ( $url, $allow_all ) = @_;
|
||
|
|
$url ||= '';
|
||
|
|
|
||
|
|
# strip leading and trailing spaces
|
||
|
|
$url =~ s/^\s*//;
|
||
|
|
$url =~ s/\s*$//;
|
||
|
|
|
||
|
|
return '' unless $url;
|
||
|
|
|
||
|
|
unless ($allow_all) {
|
||
|
|
|
||
|
|
# see what protocol they want, default to http
|
||
|
|
my $pref = "http";
|
||
|
|
$pref = $1 if $url =~ /^(https?|ftp|webcal):/;
|
||
|
|
|
||
|
|
# strip out the protocol section
|
||
|
|
$url =~ s!^.*?:/*!!;
|
||
|
|
|
||
|
|
return '' unless $url;
|
||
|
|
|
||
|
|
# rebuild safe url
|
||
|
|
$url = "$pref://$url";
|
||
|
|
}
|
||
|
|
|
||
|
|
return $url;
|
||
|
|
}
|
||
|
|
|
||
|
|
sub https_url {
|
||
|
|
my ( $url, %opts ) = @_;
|
||
|
|
|
||
|
|
# https:// and the relative // protocol don't need proxying
|
||
|
|
return $url if $url =~ m!^(?:https://|//)!;
|
||
|
|
|
||
|
|
# if this link is on a site that supports HTTPS, upgrade the protocol
|
||
|
|
my $https_ok = %LJ::KNOWN_HTTPS_SITES ? \%LJ::KNOWN_HTTPS_SITES : {};
|
||
|
|
my ($domain) = ( $url =~ m!^http://[^/]*?([^.]+\.\w{2,3})/! );
|
||
|
|
|
||
|
|
if ( $domain && ( $domain eq $LJ::DOMAIN || $https_ok->{$domain} ) ) {
|
||
|
|
$url =~ s!^http:!https:!;
|
||
|
|
return $url;
|
||
|
|
}
|
||
|
|
|
||
|
|
return DW::Proxy::get_proxy_url(
|
||
|
|
$url,
|
||
|
|
journal => $opts{journal},
|
||
|
|
ditemid => $opts{ditemid}
|
||
|
|
) || $url;
|
||
|
|
}
|
||
|
|
|
||
|
|
sub quote_html {
|
||
|
|
my ( $string, $bool ) = @_;
|
||
|
|
return $string unless $bool;
|
||
|
|
|
||
|
|
# quote all non-LJ tags
|
||
|
|
$string =~ s{<(?!/?lj)(.*?)>} {<$1>}gi;
|
||
|
|
return $string;
|
||
|
|
}
|
||
|
|
|
||
|
|
# Map of known markup formats to the clean() options that implement their
|
||
|
|
# behavior. We only care about "real" format IDs here -- aliases are handled by
|
||
|
|
# DW::Formats. (You could argue the clean options belong in DW::Formats too, but
|
||
|
|
# this was a compromise to keep other code from knowing too much about the
|
||
|
|
# cleaner's internals.)
|
||
|
|
my %markup_formats = (
|
||
|
|
html_casual0 => {
|
||
|
|
formatting => 'html',
|
||
|
|
addbreaks => 1,
|
||
|
|
at_mentions => 0,
|
||
|
|
},
|
||
|
|
html_casual1 => {
|
||
|
|
formatting => 'html',
|
||
|
|
addbreaks => 1,
|
||
|
|
at_mentions => 1,
|
||
|
|
},
|
||
|
|
html_raw0 => {
|
||
|
|
formatting => 'html',
|
||
|
|
addbreaks => 0,
|
||
|
|
at_mentions => 0,
|
||
|
|
noautolinks => 1,
|
||
|
|
},
|
||
|
|
html_extra_raw => {
|
||
|
|
formatting => 'html',
|
||
|
|
addbreaks => 0,
|
||
|
|
at_mentions => 0,
|
||
|
|
noautolinks => 1,
|
||
|
|
nodwtags => 1,
|
||
|
|
},
|
||
|
|
markdown0 => {
|
||
|
|
formatting => 'markdown',
|
||
|
|
addbreaks => 0,
|
||
|
|
at_mentions => 1,
|
||
|
|
noautolinks => 1,
|
||
|
|
},
|
||
|
|
);
|
||
|
|
|
||
|
|
# Return a group of arguments to pass to clean() based on the requested format
|
||
|
|
sub formatting_args {
|
||
|
|
my $format = DW::Formats::validate( $_[0] );
|
||
|
|
my $args = $markup_formats{$format};
|
||
|
|
unless ( ref $args ) {
|
||
|
|
$args = $markup_formats{$DW::Formats::default_format};
|
||
|
|
}
|
||
|
|
return %$args;
|
||
|
|
}
|
||
|
|
|
||
|
|
sub convert_user_mentions {
|
||
|
|
my ( $ref, $opts ) = @_;
|
||
|
|
|
||
|
|
my $usertag = sub {
|
||
|
|
my ( $orig, $user, $site ) = ( $_[0], $_[1], $_[2] || $LJ::DOMAIN );
|
||
|
|
|
||
|
|
# atproto sites use FQDNs as usernames, so if the final segment of $site
|
||
|
|
# refers to one, the rest of $site is actually a part of the username.
|
||
|
|
my ( $atuser, $atsitename ) = split /\.([^.]+)$/, ( $orig =~ tr/@//dr );
|
||
|
|
if ($atsitename) {
|
||
|
|
my $atsite = DW::External::Site->get_site( site => $atsitename );
|
||
|
|
return user_link_html( $atuser, $atsitename, $opts )
|
||
|
|
if $atsite && $atsite->servicetype eq "atproto";
|
||
|
|
}
|
||
|
|
return user_link_html( $user, $site, $opts );
|
||
|
|
};
|
||
|
|
|
||
|
|
# First pass is just to look for an edge case where an unescaped
|
||
|
|
# username that needs to be converted is the first item in the string.
|
||
|
|
$$ref =~ s!^(\@([\w\-]+)(?:\.([\w\-\.]*[\w\-]))?)(?=$|\W)!$usertag->($1, $2, $3)!mge;
|
||
|
|
|
||
|
|
# Second pass is to look for all other occurrences of unescaped usernames.
|
||
|
|
# If we find an escaped username, remove the escape sequence and continue.
|
||
|
|
# We also have to look for (and explicitly ignore) Markdown-supported escape
|
||
|
|
# sequences here, to avoid parsing edge cases like '\\@foo' incorrectly
|
||
|
|
# (note that's two user-supplied backslashes). That's why the (\\.) case is
|
||
|
|
# actually (\\.) and not (\\\@).
|
||
|
|
$$ref =~ s!(\\.)|(?<=[^\w/])(\@([\w\-]+)(?:\.([\w\-\.]*[\w\-]))?)(?=$|\W)!
|
||
|
|
defined($1) ? ( $1 eq '\@' ? '@' : $1 ) : $usertag->($2, $3, $4)
|
||
|
|
!mge;
|
||
|
|
}
|
||
|
|
|
||
|
|
# Keys available in opts hashref:
|
||
|
|
# - textonly (ignored by ljuser_display et al.)
|
||
|
|
# - preserve_lj_tags_for (ignored by ljuser_display et al.)
|
||
|
|
# - no_ljuser_class
|
||
|
|
# - no_link
|
||
|
|
sub user_link_html {
|
||
|
|
|
||
|
|
# Generate HTML to link to a user
|
||
|
|
my ( $user, $site, $opts ) = @_;
|
||
|
|
|
||
|
|
# allow external sites
|
||
|
|
# do not use link to an external site if site attribute is current domain
|
||
|
|
if ( defined $site && $site ne $LJ::DOMAIN ) {
|
||
|
|
|
||
|
|
# try to load this user@site combination
|
||
|
|
if ( my $ext_u = DW::External::User->new( user => $user, site => $site ) ) {
|
||
|
|
|
||
|
|
# looks good, render
|
||
|
|
if ( $ext_u->site eq $opts->{preserve_lj_tags_for} ) {
|
||
|
|
|
||
|
|
# that's the crosspost destination, so send a native user tag.
|
||
|
|
return qq{<lj user="$user">};
|
||
|
|
}
|
||
|
|
elsif ( $opts->{textonly} ) {
|
||
|
|
|
||
|
|
# FIXME: need a textonly way of identifying users better? "user@LJ"?
|
||
|
|
return $user;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
return $ext_u->ljuser_display(%$opts);
|
||
|
|
}
|
||
|
|
|
||
|
|
# if we hit the else, then we know that this user doesn't appear
|
||
|
|
# to be valid at the requested site
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
return
|
||
|
|
"<b>[Bad username or site: "
|
||
|
|
. LJ::ehtml( LJ::no_utf8_flag($user) ) . " @ "
|
||
|
|
. LJ::ehtml( LJ::no_utf8_flag($site) ) . "]</b>";
|
||
|
|
}
|
||
|
|
|
||
|
|
# failing that, no site or local site, use the local behavior
|
||
|
|
}
|
||
|
|
elsif ( length $user ) {
|
||
|
|
if ( my $u = LJ::load_user_or_identity($user) ) {
|
||
|
|
if ( $opts->{textonly} ) {
|
||
|
|
return $u->display_name;
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
return $u->ljuser_display($opts);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
elsif ( my $username = LJ::canonical_username($user) ) {
|
||
|
|
return LJ::ljuser( $user, $opts );
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
$user = LJ::no_utf8_flag($user);
|
||
|
|
return "<b>[Bad username or unknown identity: " . LJ::ehtml($user) . "]</b>";
|
||
|
|
}
|
||
|
|
}
|
||
|
|
else {
|
||
|
|
return "<b>[Unknown site tag]</b>";
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
# detect and remove "!markdown" at beginning of entry text;
|
||
|
|
# return true if the marker was found, false otherwise
|
||
|
|
sub legacy_markdown {
|
||
|
|
my ($ref) = @_;
|
||
|
|
return $$ref =~ s/^\s*!markdown\s*\r?\n//is;
|
||
|
|
}
|
||
|
|
|
||
|
|
1;
|