# This code was forked from the LiveJournal project owned and operated # by Live Journal, Inc. The code has been modified and expanded by # Dreamwidth Studios, LLC. These files were originally licensed under # the terms of the license supplied by Live Journal, Inc, which can # currently be found at: # # http://code.livejournal.org/trac/livejournal/browser/trunk/LICENSE-LiveJournal.txt # # In accordance with the original license, this code and all its # modifications are provided under the GNU General Public License. # A copy of that license can be found in the LICENSE file included as # part of this distribution. package LJ; use strict; use LJ::ConvUTF8; use HTML::TokeParser; use HTML::Entities; use Carp qw(cluck); use Encode; # # name: LJ::trim # class: text # des: Removes whitespace from left and right side of a string. # args: string # des-string: string to be trimmed # returns: trimmed string # sub trim { my $a = $_[0]; return '' unless defined $a; $a =~ s/^\s+//; $a =~ s/\s+$//; return $a; } # check argument text for see_request links, and strip any auth args sub strip_request_auth { my $a = $_[0]; return '' unless defined $a; $a =~ s/(see_request\S+?)\&auth=\w+/$1/ig; return $a; } # # name: LJ::get_urls # class: text # des: Returns a list of all referenced URLs from a string. # args: text # des-text: Text from which to return extra URLs. # returns: list of URLs # sub get_urls { return ( $_[0] =~ m!https?://[^\s\"\'\<\>]+!g ); } # similar to decode_url_string below, but a nicer calling convention. returns # a hash of items parsed from the string passed in as the only argument. # FIXME: This method using \0 is being used in legacy locations # however should be factored out ( to Hash::MultiValue ) # as soon as the need for the legacy use is removed. sub parse_args { my $args = $_[0]; return unless defined $args; my %GET; foreach my $pair ( split /&/, $args ) { my ( $name, $value ) = split /=/, $pair; if ( defined $value ) { $value =~ tr/+/ /; $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; } else { $value = ''; } $name =~ tr/+/ /; $name =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; $GET{$name} .= $GET{$name} ? "\0$value" : $value; } return %GET; } # # name: LJ::decode_url_string # class: web # des: Parse URL-style arg/value pairs into a hash. # args: buffer, hashref # des-buffer: Scalar or scalarref of buffer to parse. # des-hashref: Hashref to populate. # returns: boolean; true. # sub decode_url_string { my $a = shift; my $buffer = ref $a ? $a : \$a; my $hashref = shift; # output hash my $keyref = shift; # array of keys as they were found my $pair; my @pairs = split( /&/, $$buffer ); @$keyref = @pairs; my ( $name, $value ); foreach $pair (@pairs) { ( $name, $value ) = split( /=/, $pair ); $value =~ tr/+/ /; $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; $name =~ tr/+/ /; $name =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; $hashref->{$name} .= $hashref->{$name} ? "\0$value" : $value; } return 1; } # args: hashref of key/values # arrayref of keys in order (optional) # returns: urlencoded string sub encode_url_string { my ( $hashref, $keyref ) = @_; return join( '&', map { LJ::eurl($_) . '=' . LJ::eurl( $hashref->{$_} ) } ( ref $keyref ? @$keyref : keys %$hashref ) ); } # # name: LJ::eurl # class: text # des: Escapes a value before it can be put in a URL. See also [func[LJ::durl]]. # args: string # des-string: string to be escaped # returns: string escaped # sub eurl { my $a = $_[0]; return '' unless defined $a; $a =~ s/([^a-zA-Z0-9_\,\-.\/\\\: ])/uc sprintf("%%%02x",ord($1))/eg; $a =~ tr/ /+/; return $a; } # # name: LJ::durl # class: text # des: Decodes a value that's URL-escaped. See also [func[LJ::eurl]]. # args: string # des-string: string to be decoded # returns: string decoded # sub durl { my $a = $_[0]; return '' unless defined $a; $a =~ tr/+/ /; $a =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; return $a; } # # name: LJ::exml # class: text # des: Escapes a value before it can be put in XML. # args: string # des-string: string to be escaped # returns: string escaped. # sub exml { my $a = $_[0]; return '' unless defined $a; # fast path for the commmon case: return $a unless $a =~ /[&\"\'<>\x00-\x08\x0B\x0C\x0E-\x1F]/; # what are those character ranges? XML 1.0 allows: # #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] $a =~ s/\&/&/g; $a =~ s/\"/"/g; $a =~ s/\'/'/g; $a =~ s//>/g; $a =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F]//g; return $a; } # # name: LJ::ehtml # class: text # des: Escapes a value before it can be put in HTML. # args: string # des-string: string to be escaped # returns: string escaped. # sub ehtml { my $a = $_[0]; return '' unless defined $a; # fast path for the commmon case: return $a unless $a =~ /[&\"\'<>]/; # this is faster than doing one substitution with a map: $a =~ s/\&/&/g; $a =~ s/\"/"/g; $a =~ s/\'/&\#39;/g; $a =~ s//>/g; return $a; } *eall = \&ehtml; # old BML syntax required eall to also escape BML. not anymore. # # name: LJ::dhtml # class: text # des: Decodes a value that's HTML-escaped. See also [func[LJ::ehtml]]. # args: string # des-string: string to be decoded # returns: string decoded # sub dhtml { my $a = $_[0]; return '' unless defined $a; return HTML::Entities::decode_entities($a); } # # name: LJ::etags # class: text # des: Escapes < and > from a string # args: string # des-string: string to be escaped # returns: string escaped. # sub etags { my $a = $_[0]; return '' unless defined $a; # fast path for the commmon case: return $a unless $a =~ /[<>]/; $a =~ s//>/g; return $a; } # # name: LJ::ejs # class: text # des: Escapes a string value before it can be put in JavaScript. # args: string # des-string: string to be escaped # returns: string escaped. # sub ejs { my $a = $_[0]; return '' unless defined $a; # use zero-width lookahead to insert a backslash where needed $a =~ s/(?=[\"\'\\])/\\/g; $a =~ s/"/\\"/g; $a =~ s/\r?\n/\\n/gs; $a =~ s/\r//gs; $a =~ s/\xE2\x80[\xA8\xA9]//gs; return $a; } # given a string, makes it into a string you can put into javascript, # including protecting against closing tags in the entry. # does the double quotes for ya. sub ejs_string { my $str = ejs( $_[0] ); $str =~ s!]*\>/$1/g; $str =~ s/\<([^\<])+\>//g; return $str; } # # name: LJ::is_ascii # des: checks if text is pure ASCII. # args: text # des-text: text to check for being pure 7-bit ASCII text. # returns: 1 if text is indeed pure 7-bit, 0 otherwise. # sub is_ascii { my $text = $_[0]; return 1 unless defined $text; return ( $text !~ m/[^\x01-\x7f]/ ); } # Logs a warning if text has Perl's internal UTF8 flag set, so we can track it # down later. This is intended for debugging problems on prod that can't be # reproduced in dev. sub warn_for_perl_utf8 { my $text = $_[0]; if ( Encode::is_utf8($text) ) { cluck("MOJIBAKE ALERT: Found text with Perl UTF8 flag set!"); } } # # name: LJ::is_utf8 # des: check text for UTF-8 validity. # args: text # des-text: text to check for UTF-8 validity # returns: 1 if text is a valid UTF-8 stream, 0 otherwise. # sub is_utf8 { my $text = shift; if ( LJ::Hooks::are_hooks("is_utf8") ) { return LJ::Hooks::run_hook( "is_utf8", $text ); } require Unicode::CheckUTF8; { no strict; local $^W = 0; *stab = *{"main::LJ::"}; undef $stab{is_utf8}; } *LJ::is_utf8 = \&LJ::is_utf8_wrapper; return LJ::is_utf8_wrapper($text); } # # name: LJ::is_utf8_wrapper # des: wraps the check for UTF-8 validity. # args: text # des-text: text to check for UTF-8 validity # returns: 1 if text is a valid UTF-8 stream, a reference, or null; 0 otherwise. # sub is_utf8_wrapper { my $text = $_[0]; if ( defined $text && !ref $text && $text ) { # we need to make sure $text values are treated as strings return Unicode::CheckUTF8::is_utf8( '' . $text ); } else { # all possible "false" values for $text are valid unicode return 1; } } # # name: LJ::has_too_many # des: checks if text is too long # args: text, maxbreaks, maxchars # des-text: text to check if too long # des-maxbreaks: maximum number of linebreak # des-maxchars: maximum number of characters # returns: true if text has more than maxbreaks HTML linebreaks or more than maxchars characters # sub has_too_many { my ( $text, %opts ) = @_; return 1 if exists $opts{chars} && length($text) > $opts{chars}; if ( exists $opts{linebreaks} ) { # - we always call this on HTML, so ignore literal \n. # - paragraphs count as two linebreaks. # - this is ballpark guessing and ignores MANY things that can add # vertical space (
  • s, blockquotes...) -- shrug! my @breaks = $text =~ m!]*>!g; return 1 if scalar @breaks > $opts{linebreaks}; } return 0; } # alternate version of "lc" that handles UTF-8 # args: text string for lowercasing # returns: lowercase string sub utf8_lc { use Encode; # Perl 5.8 or higher # get the encoded text to work with my $text = decode( "UTF-8", $_[0] ); # return the lowercased text return encode( "UTF-8", lc $text ); } # # name: LJ::text_out # des: force outgoing text into valid UTF-8. # args: text # des-text: reference to text to pass to output. Text if modified in-place. # returns: nothing. # sub text_out { my $rtext = shift; # is this valid UTF-8 already? return if LJ::is_utf8($$rtext); # no. Blot out all non-ASCII chars $$rtext =~ s/[\x00\x80-\xff]/\?/g; return; } # # name: LJ::text_in # des: do appropriate checks on input text. Should be called on all # user-generated text. # args: text # des-text: text to check # returns: 1 if the text is valid, 0 if not. # sub text_in { my $text = shift; if ( ref($text) eq "HASH" ) { return !( grep { !LJ::is_utf8($_) } values %{$text} ); } if ( ref($text) eq "ARRAY" ) { return !( grep { !LJ::is_utf8($_) } @{$text} ); } return LJ::is_utf8($text); } # # name: LJ::text_convert # des: convert old entries/comments to UTF-8 using user's default encoding. # args: dbs?, text, u, error # des-dbs: optional. Deprecated; a master/slave set of database handles. # des-text: old possibly non-ASCII text to convert # des-u: user hashref of the journal's owner # des-error: ref to a scalar variable which is set to 1 on error # (when user has no default encoding defined, but # text needs to be translated). # returns: converted text or undef on error # sub text_convert { my ( $text, $u, $error ) = @_; # maybe it's pure ASCII? return $text if LJ::is_ascii($text); # load encoding id->name mapping if it's not loaded yet LJ::load_codes( { "encoding" => \%LJ::CACHE_ENCODINGS } ) unless %LJ::CACHE_ENCODINGS; if ( $u->{'oldenc'} == 0 || not defined $LJ::CACHE_ENCODINGS{ $u->{'oldenc'} } ) { $$error = 1; return undef; } # convert! my $name = $LJ::CACHE_ENCODINGS{ $u->{'oldenc'} }; unless ( LJ::ConvUTF8->supported_charset($name) ) { $$error = 1; return undef; } return LJ::ConvUTF8->to_utf8( $name, $text ); } # # name: LJ::text_length # des: returns both byte length and character length of a string. # The function assumes that its argument is a valid UTF-8 string. # args: text # des-text: the string to measure # returns: a list of two values, (byte_length, char_length). # sub text_length { my $text = shift; my $bl = length($text); my $cl = 0; my $utf_char = "([\x00-\x7f]|[\xc0-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...)"; while ( $text =~ m/$utf_char/go ) { $cl++; } return ( $bl, $cl ); } # # name: LJ::text_trim # des: truncate string according to requirements on byte length, char # length, or both. "char length" means number of UTF-8 characters. # args: text, byte_max, char_max # des-text: the string to trim # des-byte_max: maximum allowed length in bytes; if 0, there's no restriction # des-char_max: maximum allowed length in chars; if 0, there's no restriction # returns: the truncated string. # sub text_trim { my ( $text, $byte_max, $char_max, $didtrim_ref ) = @_; $text = defined $text ? LJ::trim($text) : ''; return $text unless $byte_max or $char_max; my $cur = 0; my $utf_char = "([\x00-\x7f]|[\xc0-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...)"; # if we don't have a character limit, assume it's the same as the byte limit. # we will never have more characters than bytes, but we might have more bytes # than characters, so we can't inherit the other way. $char_max ||= $byte_max; my $fake_scalar; my $ref = ref $didtrim_ref ? $didtrim_ref : \$fake_scalar; while ( $text =~ m/$utf_char/gco ) { unless ($char_max) { $$ref = 1; last; } if ( $byte_max and $cur + length($1) > $byte_max ) { $$ref = 1; last; } $cur += length($1); $char_max--; } return LJ::trim( substr( $text, 0, $cur ) ); } # # name: LJ::text_compress # des: Compresses a chunk of text, to gzip, if configured for site. Can compress # a scalarref in place, or return a compressed copy. Won't compress if # value is too small, already compressed, or size would grow by compressing. # args: text # des-text: either a scalar or scalarref # returns: nothing if given a scalarref (to compress in-place), or original/compressed value, # depending on site config. # sub text_compress { my $text = $_[0]; my $ref = ref $text; die "Invalid reference" if $ref && $ref ne "SCALAR"; my $tref = $ref ? $text : \$text; my $pre_len = length($$tref); unless ( substr( $$tref, 0, 2 ) eq "\037\213" || $pre_len < 100 ) { my $gz = Compress::Zlib::memGzip($$tref); if ( length($gz) < $pre_len ) { $$tref = $gz; } } return $ref ? undef : $$tref; } # # name: LJ::text_uncompress # des: Uncompresses a chunk of text, from gzip, if configured for site. Can uncompress # a scalarref in place, or return a compressed copy. Won't uncompress unless # it finds the gzip magic number at the beginning of the text. # args: text # des-text: either a scalar or scalarref. # returns: nothing if given a scalarref (to uncompress in-place), or original/uncompressed value, # depending on if test was compressed or not # sub text_uncompress { my $text = $_[0]; my $ref = ref $text; die "Invalid reference" if $ref && $ref ne "SCALAR"; my $tref = $ref ? $text : \$text; # check for gzip's magic number if ( substr( $$tref, 0, 2 ) eq "\037\213" ) { $$tref = Compress::Zlib::memGunzip($$tref); } return $ref ? undef : $$tref; } # function to trim a string containing HTML. this will auto-close any # html tags that were still open when the string was truncated sub html_trim { my ( $text, $char_max, $truncated ) = @_; return $text unless $char_max; my $p = HTML::TokeParser->new( \$text ); my @open_tags; # keep track of what tags are open my $out = ''; my $content_len = 0; TOKEN: while ( my $token = $p->get_token ) { my $type = $token->[0]; my $tag = $token->[1]; my $attr = $token->[2]; # hashref if ( $type eq "S" ) { my $selfclose; # start tag $out .= "<$tag"; # assume tags are properly self-closed $selfclose = 1 if lc $tag eq 'input' || lc $tag eq 'br' || lc $tag eq 'img'; # preserve order of attributes. the original order is # in element 4 of $token foreach my $attrname ( @{ $token->[3] } ) { if ( $attrname eq '/' ) { $selfclose = 1; next; } # FIXME: neaten $attr->{$attrname} = LJ::no_utf8_flag( $attr->{$attrname} ); $out .= " $attrname=\"" . LJ::ehtml( $attr->{$attrname} ) . "\""; } $out .= $selfclose ? " />" : ">"; push @open_tags, $tag unless $selfclose; } elsif ( $type eq 'T' || $type eq 'D' ) { my $content = $token->[1]; if ( length($content) + $content_len > $char_max ) { # truncate and stop parsing $content = LJ::text_trim( $content, undef, ( $char_max - $content_len ) ); $out .= $content; $$truncated = 1 if ref $truncated; last; } $content_len += length $content; $out .= $content; } elsif ( $type eq 'C' ) { # comment, don't care $out .= $token->[1]; } elsif ( $type eq 'E' ) { # end tag if ( $open_tags[-1] eq $tag ) { pop @open_tags; $out .= ""; } } } $out .= join( "\n", map { "" } reverse @open_tags ); return $out; } # takes a number, inserts commas where needed sub commafy { my $number = $_[0]; return '' unless defined $number; return $number unless $number =~ /^\d+$/; my $punc = LJ::Lang::ml('number.punctuation') || ","; $number =~ s/(?<=\d)(?=(\d\d\d)+(?!\d))/$punc/g; return $number; } # # name: LJ::html_newlines # des: Replace newlines with HTML break tags. # args: text # returns: text, possibly including HTML break tags. # sub html_newlines { my $text = $_[0]; return '' unless defined $text; $text =~ s/\n/
    /gm; return $text; } # prepend ">" to each line of text to make a blockquote in markdown # for when text has multiple lines and prepending ">" to the entire # text will just convert the first line / paragraph sub markdown_blockquote { my $text = $_[0]; return '' unless defined $text; $text =~ s/(^.*)/\> $1/gm; return $text; } 1;