# This code was forked from the LiveJournal project owned and operated
# by Live Journal, Inc. The code has been modified and expanded by
# Dreamwidth Studios, LLC. These files were originally licensed under
# the terms of the license supplied by Live Journal, Inc, which can
# currently be found at:
#
# http://code.livejournal.org/trac/livejournal/browser/trunk/LICENSE-LiveJournal.txt
#
# In accordance with the original license, this code and all its
# modifications are provided under the GNU General Public License.
# A copy of that license can be found in the LICENSE file included as
# part of this distribution.

package LJ::CleanHTML;
use strict;

use URI;
use HTMLCleaner;
use LJ::CSS::Cleaner;
use HTML::TokeParser;
use LJ::EmbedModule;
use LJ::Config;
use Text::Markdown;
use LJ::TextUtil;
use DW::Formats;
use DW::External::Site;

LJ::Config->load;

# Attempt to mangle an email address for printing out to HTML. This is
# kind of futile, but we try anyway.
#
# Takes the address as its first argument and returns a copy in which the
# '@' separating the two non-empty halves is replaced by the numeric
# character reference "&#64;". Because the first group is greedy, the '@'
# that gets replaced is the last one that still has text after it. Input
# that does not match (no '@', or nothing on one side of it) is returned
# unchanged. The caller's variable is never modified.
sub mangle_email_address {
    my $email = $_[0];

    # The '@' is escaped in the pattern and written as an entity in the
    # replacement: a bare "@$2" in replacement text is parsed by Perl as an
    # array dereference of $2, not as a literal '@' followed by $2.
    $email =~ s!^(.+)\@(.+)$!$1&#64;$2!;

    return $email;
}

# LJ::CleanHTML::clean(\$u->{'bio'}, {
#        'addbreaks' => 1, # insert
#            <br /> after newlines where appropriate
#        'eat' => [qw(head title style layer iframe)],
#        'mode' => 'allow',
#        'deny' => [qw(marquee)],
#        'remove' => [qw()],
#        'maximgwidth' => 100,
#        'maximgheight' => 100,
#        'keepcomments' => 1,
#        'cuturl' => 'http://www.domain.com/full_item_view.ext',
#        'ljcut_disable' => 1, # stops the cleaner from using the lj-cut tag
#        'cleancss' => 1,
#        'extractlinks' => 1, # remove a hrefs; implies noautolinks
#        'noautolinks' => 1, # do not auto linkify
#        'extractimages' => 1, # placeholder images
#        'transform_embed_nocheck' => 1, # do not do checks on object/embed tag transforming
#        'transform_embed_wmode' => , # define a wmode value for videos (usually 'transparent' is the value you want)
#        'blocked_links' => [ qr/evil\.com/, qw/spammer\.com/ ], # list of sites which URL's will be blocked
#        'blocked_link_substitute' => 'http://domain.com/error.html' # blocked links will be replaced by this URL
#        'to_external_site' => 0, # flag for when the content is going to be fed to external sites, so it can be special-cased. e.g., feeds
#     });

# Builds an HTML::TokeParser over an empty document and immediately calls
# its DESTROY method inside an eval, so any error raised there is swallowed.
# NOTE(review): presumably this exists to exercise the parser's code paths
# ahead of time (preloading) -- confirm against the callers.
sub helper_preload {
    my $parser = HTML::TokeParser->new("");
    eval { $parser->DESTROY(); };
}

# $onechar matches exactly one "character" of text, where a character is
# any of:
#   * one multi-byte UTF-8 sequence (two, three or four bytes),
#   * one byte that is not '&', not whitespace and not in \x80-\xff,
#   * one entity: '&', an optional '#', one to seven word characters, ';'.
my $onechar;
{
    # Byte patterns for the multi-byte UTF-8 encodings, shortest first.
    my @utf8_sequences = (
        '[\xc2-\xdf][\x80-\xbf]',
        '\xe0[\xa0-\xbf][\x80-\xbf]',
        '[\xe1-\xef][\x80-\xbf][\x80-\xbf]',
        '\xf0[\x90-\xbf][\x80-\xbf][\x80-\xbf]',
        '[\xf1-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]',
    );

    my $plain_char = '[^&\s\x80-\xff]';
    my $entity     = '(?:&\#?\w{1,7};)';

    my $alternation = join '|', @utf8_sequences, $plain_char, $entity;
    $onechar = qr/$alternation/o;
}

# In XHTML you can close a tag in the same opening tag like
, # but some browsers still will interpret it as an opening only tag. # This is a list of tags which you can actually close with a trailing # slash and get the proper behavior from a browser. # # In HTML5 these are called "void elements". my $slashclose_tags = qr/^(?:area|base|basefont|br|col|embed|frame|hr|img|input|isindex|link|meta|param|source|track|wbr|lj-embed|site-embed|poll-\d+|lj-poll-\d+)$/i; # # name: LJ::CleanHTML::clean # class: text # des: Multi-faceted HTML parse function # info: # args: data, opts # des-data: A reference to HTML to parse to output, or HTML if modified in-place. # des-opts: An hash of options to pass to the parser. # returns: Nothing. # sub clean { my $data = shift; return undef unless defined $$data; my $opts = shift; # this has to be an empty string because otherwise we might never actually append # anything to it if $$data contains only invalid content my $newdata = ''; # Set up configuration and defaults: my $addbreaks = $opts->{addbreaks}; # \n ->
my $keepcomments = $opts->{keepcomments}; my $mode = $opts->{mode}; my $nodwtags = $opts->{nodwtags} || 0; # Disable all special DW/LJ tags my $cut = $opts->{cuturl} || $opts->{cutpreview}; my $ljcut_disable = $opts->{ljcut_disable}; my $extractlinks = 0 || $opts->{extractlinks}; # Links become `text (url)` my $noexpand_embedded = $opts->{noexpandembedded} || $opts->{textonly} || 0; my $transform_embed_nocheck = $opts->{transform_embed_nocheck} || 0; my $transform_embed_wmode = $opts->{transform_embed_wmode}; my $rewrite_embed_param = $opts->{rewrite_embed_param} || 0; my $remove_colors = $opts->{remove_colors} || 0; my $remove_sizes = $opts->{remove_sizes} || 0; my $remove_abs_sizes = $opts->{remove_abs_sizes} || 0; my $remove_fonts = $opts->{remove_fonts} || 0; my $at_mentions = $opts->{at_mentions} || 0; # @person.place -> user tag my $formatting = $opts->{formatting} // 'html'; # html, or do we need to convert? my $auto_links = !( $extractlinks || $opts->{noautolinks} ); $auto_links = 0 if $formatting ne 'html'; $cut = 0 if $nodwtags; $at_mentions = 0 if $nodwtags; my $blocked_links = ( exists $opts->{'blocked_links'} ) ? $opts->{'blocked_links'} : \@LJ::BLOCKED_LINKS; my $blocked_link_substitute = ( exists $opts->{'blocked_link_substitute'} ) ? $opts->{'blocked_link_substitute'} : ($LJ::BLOCKED_LINK_SUBSTITUTE) ? 
$LJ::BLOCKED_LINK_SUBSTITUTE : '#'; my $suspend_msg = $opts->{'suspend_msg'} || 0; my $to_external_site = $opts->{to_external_site} || 0; my $preserve_lj_tags_for = $opts->{preserve_lj_tags_for} || 0; # False or site name my $remove_positioning = $opts->{'remove_positioning'} || 0; my $errref = $opts->{errref}; my $verbose_err = $opts->{verbose_err}; # Verbose parse errors my @unclosed_tags; # for ajax cut tag parsing my $cut_retrieve = $opts->{cut_retrieve} || 0; my $journal = $opts->{journal} || ""; my $ditemid = $opts->{ditemid} || ""; my %action = (); my %remove = (); if ( ref $opts->{'allow'} eq "ARRAY" ) { foreach ( @{ $opts->{'allow'} } ) { $action{$_} = "allow"; } } if ( ref $opts->{'eat'} eq "ARRAY" ) { foreach ( @{ $opts->{'eat'} } ) { $action{$_} = "eat"; } } if ( ref $opts->{'deny'} eq "ARRAY" ) { foreach ( @{ $opts->{'deny'} } ) { $action{$_} = "deny"; } } if ( ref $opts->{'remove'} eq "ARRAY" ) { foreach ( @{ $opts->{'remove'} } ) { $action{$_} = "deny"; $remove{$_} = 1; } } if ( ref $opts->{'conditional'} eq "ARRAY" ) { foreach ( @{ $opts->{'conditional'} } ) { $action{$_} = "conditional"; } } $action{'script'} = "eat"; # if removing sizes, remove heading tags if ($remove_sizes) { foreach my $tag (qw( h1 h2 h3 h4 h5 h6 )) { $action{$tag} = "deny"; $remove{$tag} = 1; } } if ( $opts->{'strongcleancss'} ) { $opts->{'cleancss'} = 1; } my @attrstrip = qw(); # cleancss means clean annoying css # clean_js_css means clean javascript from css if ( $opts->{'cleancss'} ) { push @attrstrip, 'id'; $opts->{'clean_js_css'} = 1; } if ( $opts->{'nocss'} ) { push @attrstrip, 'style'; } if ( ref $opts->{'attrstrip'} eq "ARRAY" ) { foreach ( @{ $opts->{'attrstrip'} } ) { push @attrstrip, $_; } } # Do some preprocessing of the input text before we try to parse it as HTML: # First, remove the auth portion of any see_request links $$data = LJ::strip_request_auth($$data); # Second, convert Markdown; from here on, we can process it as raw HTML (no autoformatting) if ( 
$formatting eq 'markdown' ) { $$data = Text::Markdown::markdown($$data); $addbreaks = 0; } # Create the HTML parser we'll use to navigate the text from here on out: my $p = HTML::TokeParser->new($data); # Set up state variables: my @canonical_urls; # extracted links my %opencount = map { $_ => 0 } qw(td th); my @tablescope = (); my $cutcount = 0; # bytes known good. set this BEFORE we start parsing any new # start tag, where most evil is (because where attributes can be) # then, if we have to totally fail, we can cut stuff off after this. my $good_until = 0; # then, if we decide that part of an entry has invalid content, we'll # escape that part and stuff it in here. this lets us finish cleaning # the "good" part of the entry (since some tags might not get closed # till after $good_until bytes into the text). my $extra_text; my $total_fail = sub { my ( $cuturl, $tag ) = @_; $tag = LJ::ehtml($tag); my $err_str; my $edata = LJ::ehtml($$data); $edata =~ s/\r?\n/
/g if $addbreaks; if ($cuturl) { my $cutlink = LJ::ehtml($cuturl); $err_str = '.error.markup'; $extra_text = "" . LJ::Lang::ml( 'cleanhtml.error.markup', { aopts => "href='$cutlink'" } ) . ""; } else { $err_str = { error => '.error.markup.extra', opts => { aopts => $tag } }; $extra_text = LJ::Lang::ml( 'cleanhtml.error.markup.extra', { aopts => $tag } ) . "

" . '

' . $edata . '

'; } $extra_text = "

$extra_text

"; $$verbose_err = $err_str if $verbose_err; $$errref = "parseerror" if $errref; }; my $htmlcleaner = HTMLCleaner->new( valid_stylesheet => \&LJ::valid_stylesheet_url ); my $eating_ljuser_span = 0; # bool, if we're eating an ljuser span my $ljuser_text_node = ""; # the last text node we saw while eating ljuser tags my @eatuntil = (); # if non-empty, we're eating everything. thing at end is thing # we're looking to open again or close again. my $capturing_during_eat; # if we save all tokens that happen inside the eating. my @capture = (); # if so, they go here my @tagstack = (); # so we can make sure that tags are closed properly/in order my $disable_user_conversion = 0; my $form_tag = { input => 1, select => 1, option => 1, }; my $start_capture = sub { next if $capturing_during_eat; my ( $tag, $first_token, $cb ) = @_; push @eatuntil, $tag; @capture = ($first_token); $capturing_during_eat = $cb || sub { }; }; my $finish_capture = sub { @capture = (); $capturing_during_eat = undef; }; # we now allow users to use new tags that aren't "lj" tags. this short # stub allows us to "upgrade" the tag. my $tag_updates = { 'cut' => 'lj-cut', 'poll' => 'lj-poll', 'poll-item' => 'lj-pi', 'poll-question' => 'lj-pq', 'raw-code' => 'lj-raw', 'site-embed' => 'lj-embed', 'user' => 'lj', }; my $update_tag = sub { return $tag_updates->{ $_[0] } || $_[0]; }; my $usertag_opts = { textonly => $opts->{textonly} ? 1 : 0, preserve_lj_tags_for => $opts->{preserve_lj_tags_for} || 0, no_ljuser_class => $opts->{to_external_site} ? 1 : 0, no_link => 0, }; # if we're retrieving a cut tag, then we want to eat everything # until we hit the first cut tag. my @cuttag_stack = (); my $eatall = $cut_retrieve ? 1 : 0; TOKEN: while ( my $token = $p->get_token ) { my $type = $token->[0]; $usertag_opts->{no_link} = $opencount{'a'} ? 
1 : 0; if ( $type eq "S" ) # start tag { my $tag = $update_tag->( $token->[1] ); my $attr = $token->[2]; # hashref my $ljcut_div = $tag eq "div" && defined lc $attr->{class} && lc $attr->{class} eq "ljcut"; $good_until = length $newdata; if (@eatuntil) { push @capture, $token if $capturing_during_eat; # have to keep the cut counts consistent even if they're nested if ( $tag eq "lj-cut" || $ljcut_div ) { $cutcount++; } if ( $tag eq $eatuntil[-1] ) { push @eatuntil, $tag; } next TOKEN; } # if we're looking for cut tags, ignore everything that's # not a cut tag. if ( $eatall && $tag ne "lj-cut" && !$ljcut_div ) { next TOKEN; } if ( $tag eq "lj-template" && !$noexpand_embedded && !$nodwtags ) { my $name = $attr->{name} || ""; $name =~ s/-/_/g; my $run_template_hook = sub { # deprecated - will always print an error msg (see #1869) $newdata .= "" . LJ::Lang::ml( 'cleanhtml.error.template', { aopts => LJ::ehtml($name) } ) . ""; }; if ( $attr->{'/'} ) { # template is self-closing, no need to do capture $run_template_hook->( $token, 1 ); } else { # capture and send content to hook $start_capture->( "lj-template", $token, $run_template_hook ); } next TOKEN; } # Capture object and embed tags to possibly transform them into something else. if ( $tag eq "object" || $tag eq "embed" ) { if ( LJ::Hooks::are_hooks("transform_embed") && !$noexpand_embedded ) { # XHTML style open/close tags done as a singleton shouldn't actually # start a capture loop, because there won't be a close tag. 
if ( $attr->{'/'} ) { $newdata .= LJ::Hooks::run_hook( "transform_embed", [$token], nocheck => $transform_embed_nocheck, wmode => $transform_embed_wmode ) || ""; next TOKEN; } $start_capture->( $tag, $token, sub { my $expanded = LJ::Hooks::run_hook( "transform_embed", \@capture, nocheck => $transform_embed_nocheck, wmode => $transform_embed_wmode ); $newdata .= $expanded || ""; } ); next TOKEN; } } if ( $tag eq "embed" && $rewrite_embed_param ) { $attr->{allowscriptaccess} = "sameDomain" if exists $attr->{allowscriptaccess} && $attr->{allowscriptaccess} ne 'never'; } if ( $tag eq "param" && $rewrite_embed_param && $opencount{object} && lc( $attr->{name} ) eq 'allowscriptaccess' ) { $attr->{value} = "sameDomain" if $attr->{value} ne 'never'; } if ( $tag eq "span" && lc $attr->{class} eq "ljuser" && !$noexpand_embedded && !$nodwtags ) { $eating_ljuser_span = 1; $ljuser_text_node = ""; } if ($eating_ljuser_span) { next TOKEN; } # deprecated - will always print an error msg (see #1869) if ( ( $tag eq "div" || $tag eq "span" ) && defined $attr->{class} && lc $attr->{class} eq "ljvideo" ) { $start_capture->( $tag, $token, sub { $newdata .= "" . LJ::Lang::ml('cleanhtml.error.template.video') . ""; } ); next TOKEN; } # do some quick checking to see if this is an email address/URL, and if so, just # escape it and ignore it if ( $tag =~ m!(?:\@|://)! ) { $newdata .= LJ::ehtml("<$tag>"); next; } if ( $form_tag->{$tag} ) { if ( !$opencount{form} ) { $newdata .= "<$tag ... >"; next; } if ( $tag eq "input" ) { if ( $attr->{type} !~ /^\w+$/ || lc $attr->{type} eq "password" ) { delete $attr->{type}; } } } my $slashclose = 0; # If set to 1, use XML-style empty tag marker # for tags like , pretend it's and reinsert the slash later $slashclose = 1 if ( $tag =~ s!/$!! 
); unless ( $tag =~ /^\w([\w\-:_]*\w)?$/ ) { $total_fail->( $cut, $tag ); last TOKEN; } # for incorrect tags like (note the lack of a space) # delete everything after 'name' to prevent a security loophole which happens # because IE understands them. $tag =~ s!/.+$!!; if ( defined $action{$tag} and $action{$tag} eq "eat" ) { $p->unget_token($token); $p->get_tag("/$tag"); next; } # force this specific instance of the tag to be allowed (for conditional) my $force_allow = 0; if ( defined $action{$tag} and $action{$tag} eq "conditional" ) { if ( $tag eq "iframe" ) { my $can_https; ( $force_allow, $can_https ) = LJ::Hooks::run_hook( 'allow_iframe_embeds', $attr->{src} ); $attr->{src} =~ s!^https?:!! if $opts->{force_https_embed} && $can_https; # convert to protocol-relative URL unless ($force_allow) { ## eat this tag if ( !$attr->{'/'} ) { ## if not autoclosed tag (