so we don't have to account for it below.
my $user = $attr->{user} =
exists $attr->{name} ? $attr->{name}
: exists $attr->{user} ? $attr->{user}
: exists $attr->{comm} ? $attr->{comm}
: undef;
$newdata .= user_link_html( $user, $attr->{site}, $usertag_opts );
}
elsif ( $tag eq "lj-raw" && !$nodwtags ) {
# Strip it out, but still register it as being open
$opencount{$tag}++;
}
# Don't allow any tag whose name ends in ":set"
elsif ( $tag =~ m/:set$/ ) {
next;
}
else {
my $alt_output = 0;
my $hash = $token->[2];
my $attrs = $token->[3]; # attribute names, in original order
$slashclose = 1 if delete $hash->{'/'};
foreach (@attrstrip) {
# maybe there's a better place for this?
next if ( lc $tag eq 'lj-embed' && lc $_ eq 'id' );
delete $hash->{$_};
}
if ( $tag eq "form" ) {
my $action = lc( $hash->{'action'} );
my $deny = 0;
if ( $action =~ m!^https?://?([^/]+)! ) {
my $host = $1;
$deny = 1
if $host =~ /[%\@\s]/
|| $LJ::FORM_DOMAIN_BANNED{$host};
}
else {
$deny = 1;
}
delete $hash->{'action'} if $deny;
}
ATTR:
foreach my $attr ( keys %$hash ) {
if ( $attr =~ /^(?:on|dynsrc)/ ) {
delete $hash->{$attr};
next;
}
if ( $attr eq "data" ) {
delete $hash->{$attr};
# type specifies the content type for the data specified by "data"
# without the data, this has no useful effect
# but may cause the object tag not to use the fallback values in Firefox
delete $hash->{"type"};
next;
}
if ( $attr =~ /(?:^=)|[\x0b\x0d]/ ) {
# Cleaner attack: ' onmouseover="javascript:alert(document/**/.cookie)" >
# is returned by HTML::Parser as P_tag("='" => "='") Text( onmouseover...)
# which leads to reconstruction of valid HTML. Clever!
# detect this, and fail.
$total_fail->( $cut, "$tag $attr" );
last TOKEN;
}
# ignore attributes that do not fit this strict scheme
unless ( $attr =~ /^[\w_:-]+$/ ) {
$total_fail->(
$cut, "$tag " . ( scalar keys %$hash > 1 ? "[...] " : "" ) . "$attr"
);
last TOKEN;
}
$hash->{$attr} =~ s/[\t\n]//g;
# IE ignores the null character, so strip it out
$hash->{$attr} =~ s/\x0//g;
# IE sucks:
my $nowhite = $hash->{$attr};
$nowhite =~ s/[\s\x0b]+//g;
if ( $nowhite =~ /(?:jscript|livescript|javascript|vbscript|^about|data):/ix ) {
delete $hash->{$attr};
next;
}
if ( $attr eq 'style' ) {
if ( $opts->{'cleancss'} ) {
# css2 spec, section 4.1.3
# position === p\osition :(
# strip all slashes no matter what.
$hash->{style} =~ s/\\//g;
# and catch the obvious ones ("[" is for things like document["coo"+"kie"])
foreach my $css ( "/*", "[",
qw(absolute fixed expression eval behavior cookie document window javascript -moz-binding)
)
{
if ( $hash->{style} =~ /\Q$css\E/i ) {
delete $hash->{style};
next ATTR;
}
}
if ( $opts->{'strongcleancss'} ) {
if ( $hash->{style} =~
/-moz-|absolute|relative|outline|z-index|(?{style};
next ATTR;
}
}
# remove specific CSS definitions
if ($remove_colors) {
$hash->{style} =~ s/(?:background-)?color:.*?(?:;|$)//gi;
}
if ($remove_sizes) {
$hash->{style} =~ s/font-size:.*?(?:;|$)//gi;
}
elsif ($remove_abs_sizes) {
$hash->{style} =~ s/font-size:\s*?\d+.*?(?:;|$)//gi;
}
if ($remove_fonts) {
$hash->{style} =~ s/font-family:.*?(?:;|$)//gi;
}
if ($remove_positioning) {
$hash->{style} =~ s/margin.*?(?:;|$)//gi;
$hash->{style} =~ s/height\s*?:.*?(?:;|$)//gi;
$hash->{style} =~ s/display\s*?:\s*?none\s*?(?:;|$)//gi;
my $too_large = 0;
PADDING:
while ( $hash->{style} =~ /padding.*?:\s*?(.*?)(?:;|$)/gi ) {
my $padding_value = $1;
foreach ( split /\s+/, $padding_value ) {
next unless $_;
if ( ( int($_) || 0 ) > 500 ) {
$too_large = 1;
last PADDING;
}
}
}
$hash->{style} =~ s/padding.*?(?:;|$)//gi
if $too_large;
}
if ($extractlinks) {
$hash->{style} =~ s/url\(.*?\)//gi;
}
}
if ( $opts->{'clean_js_css'} && LJ::is_enabled('css_cleaner') ) {
# and then run it through a harder CSS cleaner that does a full parse
my $css = LJ::CSS::Cleaner->new;
$hash->{style} = $css->clean_property( $hash->{style} );
}
}
if ( ( $attr eq 'class' || $attr eq 'id' ) && $opts->{'strongcleancss'} ) {
delete $hash->{$attr};
next;
}
# reserve ljs_* ids for divs, etc so users can't override them to replace content
if ( $attr eq 'id' && $hash->{$attr} =~ /^ljs_/i ) {
delete $hash->{$attr};
next;
}
# remove specific attributes
my %remove_attrs = (
color => $remove_colors,
bgcolor => $remove_colors,
fgcolor => $remove_colors,
text => $remove_colors,
size => $remove_sizes,
face => $remove_fonts,
);
if ( $remove_attrs{$attr} ) {
delete $hash->{$attr};
next ATTR;
}
}
if ( exists $hash->{href} ) {
## links to some resources will be completely blocked
## and replaced by value of 'blocked_link_substitute' param
if ($blocked_links) {
foreach my $re (@$blocked_links) {
if ( $hash->{href} =~ $re ) {
$hash->{href} =
sprintf( $blocked_link_substitute, LJ::eurl( $hash->{href} ) );
last;
}
}
}
unless ( $hash->{href} =~ s/^(?:lj|site):(?:\/\/)?(.*)$/ExpandLJURL($1)/ei ) {
$hash->{href} = canonical_url( $hash->{href}, 1 );
}
}
if ( $tag eq "img" ) {
my $img_bad = 0;
if ( defined $opts->{'maximgwidth'}
&& $hash->{width} > $opts->{maximgwidth} )
{
$img_bad = 1;
}
if ( defined $opts->{'maximgheight'}
&& $hash->{height} > $opts->{maximgheight} )
{
$img_bad = 1;
}
if ( !defined $hash->{width}
|| !defined $hash->{height} )
{
$img_bad ||= $opts->{imageplaceundef};
}
if ( $opts->{'extractimages'} ) { $img_bad = 1; }
my $sanitize_url = sub {
my $url = canonical_url( $_[0], 1 );
return $url if $to_external_site;
return https_url( $url, journal => $journal, ditemid => $ditemid );
};
$hash->{src} = $sanitize_url->( $hash->{src} );
# some responsive images use srcset as well as src;
# both attributes should be proxied for https if requested
if ( defined $hash->{srcset} ) {
$hash->{srcset} =~ s!\b(http://\S+)!$sanitize_url->( $1 )!egi;
}
if ($img_bad) {
$newdata .=
"{'src'} ) . "\">"
. LJ::img('placeholder') . '';
$alt_output = 1;
$opencount{"img"}++;
}
}
if ( $tag eq "a" && $extractlinks ) {
push @canonical_urls, canonical_url( $token->[2]->{href}, 1 );
$newdata .= "";
next;
}
# Through the xsl namespace in XML, it is possible to embed scripting languages
# as elements which will then be executed by the browser. Combining this with
# customview.cgi makes it very easy for someone to replace their entire journal
# in S1 with a page that embeds scripting as well. An example being an AJAX
# six degrees tool, while cool it should not be allowed.
#
# FIXME Dreamwidth does not support S1 and customview has been removed.
#
# Example syntax:
#
# text/javascript
if ( $tag eq 'xsl:attribute' ) {
$alt_output = 1; # We'll always deal with output for this token
my $orig_value = $p->get_text; # Get the value of this element
my $value = $orig_value; # Make a copy if this turns out to be alright
$value =~ s/\s+//g; # Remove any whitespace
# See if they are trying to output scripting, if so eat the xsl:attribute
# container and its value
if ( $value =~ /(javascript|vbscript)/i ) {
# Remove the closing tag from the tree
$p->get_token;
# Remove the value itself from the tree
$p->get_text;
# No harm, no foul...Write back out the original
}
else {
$newdata .= "$token->[4]$orig_value";
}
}
unless ($alt_output) {
my $allow;
if ( $mode eq "allow" ) {
$allow = 1;
if ( defined $action{$tag} and $action{$tag} eq "deny" ) { $allow = 0; }
if ( defined $action{$tag} and $action{$tag} eq "conditional" ) {
$allow = $force_allow;
}
}
else {
$allow = 0;
if ( defined $action{$tag} and $action{$tag} eq "allow" ) { $allow = 1; }
}
if ( $allow && !$remove{$tag} ) {
$allow = 0 if
# can't open table elements from outside a table
( $tag =~ /^(?:tbody|thead|tfoot|tr|td|th|caption|colgroup|col)$/
&& !@tablescope )
||
# can't open td or th if not inside tr
( $tag =~ /^(?:td|th)$/ && !$tablescope[-1]->{'tr'} ) ||
# can't open a table unless inside a td or th
( $tag eq 'table' && @tablescope && !grep { $tablescope[-1]->{$_} }
qw(td th) );
if ($allow) { $newdata .= "<$tag"; }
else { $newdata .= "<$tag"; }
# output attributes in original order, but only those
# that are allowed (by still being in %$hash after cleaning)
foreach (@$attrs) {
unless ( LJ::is_ascii( $hash->{$_} ) ) {
# FIXME: this isn't nice. make faster. make generic.
# HTML::Parser decodes entities for us (which is good)
# but in Perl 5.8 also includes the "poison" SvUTF8
# flag on the scalar it returns, thus poisoning the
# rest of the content this scalar is appended with.
# we need to remove that poison at this point. *sigh*
$hash->{$_} = LJ::no_utf8_flag( $hash->{$_} );
}
$newdata .= " $_=\"" . LJ::ehtml( $hash->{$_} ) . "\""
if exists $hash->{$_};
}
if ($slashclose) {
if ( $tag =~ $slashclose_tags ) {
# ignore the effects of slashclose unless we're dealing with a tag that can
# actually close itself. Otherwise, a slash-closed tag that cannot really close itself (e.g. <em />) can pass through as valid
# even though some browsers just render it as an opening tag
$newdata .= " /";
$opencount{$tag}--;
$tablescope[-1]->{$tag}-- if @tablescope;
}
else {
# we didn't actually slash close, treat this as a normal opening tag
$slashclose = 0;
}
}
if ($allow) {
$newdata .= ">";
$opencount{$tag}++;
# open table
if ( $tag eq 'table' ) {
push @tablescope, {};
# new tag within current table
}
elsif (@tablescope) {
$tablescope[-1]->{$tag}++;
}
# we have all this previous logic which makes us
# not automatically close tags inside tables
# so rather than mess with it, let's just ignore those
# and only deal with non-self-closing tags
# which are not in a table
# (but we still want to close <table>; that's not yet inside the table)
push @tagstack, $tag
if !$slashclose && ( $tag eq "table" || !@tablescope );
}
else { $newdata .= ">"; }
}
}
}
}
# end tag
elsif ( $type eq "E" ) {
my $tag = $update_tag->( $token->[1] );
next TOKEN if $tag =~ /[^\w\-:]/;
if (@eatuntil) {
push @capture, $token if $capturing_during_eat;
if ( $eatuntil[-1] eq $tag ) {
pop @eatuntil;
if ( my $cb = $capturing_during_eat ) {
$cb->();
$finish_capture->();
}
next TOKEN;
}
next TOKEN if @eatuntil;
}
# if we're just getting the contents of a cut tag, then pop the
# tag off the stack. if this is the last tag on the stack, then
# go back to eating the rest of the content.
if (@cuttag_stack) {
if ( $cuttag_stack[-1] eq $tag ) {
pop @cuttag_stack;
last TOKEN unless (@cuttag_stack);
}
}
if ($eatall) {
next TOKEN;
}
if ($eating_ljuser_span) {
if ( $tag eq "span" ) {
$eating_ljuser_span = 0;
$newdata .= user_link_html( $ljuser_text_node, undef, $usertag_opts );
}
next TOKEN;
}
# Hack: For Twitter, which uses blockquotes to embed tweets, re-enable
# user conversion once we've exited a blockquote.
if ( $disable_user_conversion && $tag eq 'blockquote' ) {
$disable_user_conversion = 0;
}
my $allow;
if ( $tag eq "lj-raw" && !$nodwtags ) {
$opencount{$tag}--;
$tablescope[-1]->{$tag}-- if @tablescope;
}
elsif ( $tag eq "lj-cut" && !$nodwtags ) {
# Since this is an end-tag, we can't know if it's the closing
# div for a faked cut tag (presumably <div class="ljcut">), which means that
# community moderators can't see the closing cut marker at the end of one
# of those tags; if this was a problem, then the 'S' branch of
# this function would need to record the ljcut_div flag in a
# state variable which is stashed across tokens.
if ( $opts->{preserve_lj_tags_for} && $opencount{'lj-cut'} ) {
$opencount{'lj-cut'}--;
$newdata .= "";
}
elsif ( $opts->{'cutpreview'} ) {
$newdata .= "
</cut>";
}
}
else {
if ( $mode eq "allow" ) {
$allow = 1;
if ( defined $action{$tag}
and ( $action{$tag} eq "deny" || $action{$tag} eq "conditional" ) )
{
$allow = 0;
}
}
else {
$allow = 0;
if ( defined $action{$tag} and $action{$tag} eq "allow" ) { $allow = 1; }
}
if ( $extractlinks && $tag eq "a" ) {
if (@canonical_urls) {
my $url = LJ::ehtml( pop @canonical_urls );
$newdata .= " ($url)";
next;
}
}
if ( $allow && !$remove{$tag} ) {
$allow = 0 if
# can't close table elements from outside a table
( $tag =~ /^(?:table|tbody|thead|tfoot|tr|td|th|caption|colgroup|col)$/
&& !@tablescope )
||
# can't close td or th unless open tr
( $tag =~ /^(?:td|th)$/ && !$tablescope[-1]->{'tr'} );
if ( $allow && !( $opts->{'noearlyclose'} && !$opencount{$tag} ) ) {
unless (@tablescope) {
my $close;
while ( ( $close = pop @tagstack ) && $close ne $tag ) {
$opencount{$close}--;
next if $close =~ $slashclose_tags;
$newdata .= "$close>";
push @unclosed_tags, "$close"
unless $close eq 'p' || $close eq 'li';
}
}
# closing the table itself
if ( $tag eq 'table' ) {
pop @tablescope;
pop @tagstack if $tagstack[-1] eq 'table';
# closing tag within current table
}
elsif (@tablescope) {
# If this tag was not opened inside this table, then
# do not close it! (This lets the auto-closer clean
# up later.)
next TOKEN unless $tablescope[-1]->{$tag};
$tablescope[-1]->{$tag}--;
}
if ( $opencount{$tag} ) {
$newdata .= "$tag>";
$opencount{$tag}--;
}
}
elsif ( !$allow || $form_tag->{$tag} && !$opencount{form} ) {
# tag wasn't allowed, or we have an out of scope form tag? display it then
$newdata .= "</$tag>";
}
else {
# This is a closing tag for something that isn't open. We ignore these
# and do nothing with them.
}
}
if ( defined $action{$tag}
and $action{$tag} eq "conditional" && $tagstack[-1] eq $tag )
{
$newdata .= "$tag>";
pop @tagstack;
$opencount{$tag}--;
}
}
}
elsif ( $type eq "D" ) {
# remove everything past first closing tag
$token->[1] =~ s/>.+/>/s;
# kill any opening tag except the starting one
$token->[1] =~ s/./sg;
$newdata .= $token->[1];
}
elsif ( $type eq "T" ) {
my %url = ();
my $urlcount = 0;
if (@eatuntil) {
push @capture, $token if $capturing_during_eat;
next TOKEN;
}
if ($eatall) {
next TOKEN;
}
if ($eating_ljuser_span) {
$ljuser_text_node = $token->[1];
next TOKEN;
}
# auto_format means: the dialect is "html with auto linebreaks," AND
# we're not currently in a context that needs to remain raw.
my $auto_format =
$formatting eq 'html'
&& $addbreaks
&& ( ( $opencount{table} || 0 ) <= ( $opencount{td} + $opencount{th} ) )
&& !$opencount{'pre'}
&& !$opencount{'textarea'}
&& !$opencount{'lj-raw'};
# Stash any URLs that should be auto-linked, and insert temporary
# placeholders that can survive the next few escaping steps. We'll
# restore the URLs later as links.
if ( $auto_format && $auto_links && !$opencount{'a'} ) {
my $match = sub {
my $str = shift;
if ( $str =~ /^(.*?)(&(#39|quot|lt|gt)(;.*)?)$/ ) {
$url{ ++$urlcount } = $1;
return "&url$urlcount;$1&urlend;$2";
}
else {
$url{ ++$urlcount } = $str;
return "&url$urlcount;$str&urlend;";
}
};
$token->[1] =~ s!(https?://[^\s\'\"\<\>]+[a-zA-Z0-9_/&=\-])! $match->( $1 ); !ge;
}
# escape tags in text tokens. shouldn't belong here!
# especially because the parser returns things it's
# confused about (broken, ill-formed HTML) as text.
$token->[1] =~ s/</g;
$token->[1] =~ s/>/>/g;
# auto-format some stuff!
if ($auto_format) {
# Add linebreaks
$token->[1] =~ s/\r?\n/
/g;
if ( !$opencount{'a'} ) {
# Restore any auto-linked URLs as real HTML links
$token->[1] =~ s/&url(\d+);(.*?)&urlend;/
$2<\/a>/g;
}
}
# convert user mentions, if we're in an appropriate context
if ($at_mentions) {
# Don't mangle code spans, code blocks, things that act like
# code blocks, or things we KNOW have foreign @mentions in em.
if ( !$disable_user_conversion
&& !$opencount{'code'}
&& !$opencount{'pre'}
&& !$opencount{'textarea'}
&& !$opencount{'lj-raw'} )
{
convert_user_mentions( \$token->[1], $usertag_opts );
}
}
$newdata .= $token->[1];
}
elsif ( $type eq "C" ) {
# probably a malformed tag rather than a comment, so escape it
# -- ehtml things like "<3", "<--->", "<>", etc
# -- comments must start with [1] =~ /^<[^!]/ ) {
$newdata .= LJ::ehtml( $token->[1] );
# by default, ditch comments
}
elsif ($keepcomments) {
my $com = $token->[1];
$com =~ s/^$//;
$com =~ s///;
$newdata .= "";
}
}
elsif ( $type eq "PI" ) {
my $tok = $token->[1];
$tok =~ s/</g;
$tok =~ s/>/>/g;
$newdata .= "$tok>";
}
else {
$newdata .= "\n";
}
} # end while
# finish up open links if we're extracting them
if ( $extractlinks && @canonical_urls ) {
foreach my $url (@canonical_urls) {
$newdata .= " (" . LJ::ehtml($url) . ")";
$opencount{'a'}--;
}
}
# if we have a textarea open, we *MUST* close it first
if ( $opencount{textarea} ) {
$newdata .= "";
push @unclosed_tags, "textarea";
}
$opencount{textarea} = 0;
# close any tags that were opened and not closed
# don't close tags that don't need a closing tag -- otherwise,
# we output the closing tags in the wrong place (eg, a </td>
# after the <table> was closed) causing unnecessary problems
foreach my $tag ( reverse @tagstack ) {
next if $tag =~ $slashclose_tags;
if ( $opencount{$tag} ) {
$newdata .= "$tag>";
$opencount{$tag}--;
push @unclosed_tags, $tag unless $tag eq 'p' || $tag eq 'li';
}
}
# If crossposting, explicitly close cuts to keep the crosspost footer visible.
if ( $preserve_lj_tags_for && $opencount{'lj-cut'} ) {
while ( $opencount{'lj-cut'} > 0 ) {
$newdata .= "";
$opencount{'lj-cut'}--;
}
}
# extra-paranoid check
1 while $newdata =~ s/