mourningdove/cgi-bin/LJ/TextUtil.pm

724 lines
19 KiB
Perl
Raw Normal View History

2026-05-24 01:03:05 +00:00
# This code was forked from the LiveJournal project owned and operated
# by Live Journal, Inc. The code has been modified and expanded by
# Dreamwidth Studios, LLC. These files were originally licensed under
# the terms of the license supplied by Live Journal, Inc, which can
# currently be found at:
#
# http://code.livejournal.org/trac/livejournal/browser/trunk/LICENSE-LiveJournal.txt
#
# In accordance with the original license, this code and all its
# modifications are provided under the GNU General Public License.
# A copy of that license can be found in the LICENSE file included as
# part of this distribution.
package LJ;
use strict;
use LJ::ConvUTF8;
use HTML::TokeParser;
use HTML::Entities;
use Carp qw(cluck);
use Encode;
# <LJFUNC>
# name: LJ::trim
# class: text
# des: Removes whitespace from left and right side of a string.
# args: string
# des-string: string to be trimmed
# returns: trimmed string
# </LJFUNC>
sub trim {
my $a = $_[0];
return '' unless defined $a;
$a =~ s/^\s+//;
$a =~ s/\s+$//;
return $a;
}
# check argument text for see_request links, and strip any auth args
sub strip_request_auth {
my $a = $_[0];
return '' unless defined $a;
$a =~ s/(see_request\S+?)\&auth=\w+/$1/ig;
return $a;
}
# <LJFUNC>
# name: LJ::get_urls
# class: text
# des: Returns a list of all referenced URLs from a string.
# args: text
# des-text: Text from which to return extra URLs.
# returns: list of URLs
# </LJFUNC>
sub get_urls {
return ( $_[0] =~ m!https?://[^\s\"\'\<\>]+!g );
}
# similar to decode_url_string below, but a nicer calling convention. returns
# a hash of items parsed from the string passed in as the only argument.
# FIXME: This method using \0 is being used in legacy locations
# however should be factored out ( to Hash::MultiValue )
# as soon as the need for the legacy use is removed.
sub parse_args {
my $args = $_[0];
return unless defined $args;
my %GET;
foreach my $pair ( split /&/, $args ) {
my ( $name, $value ) = split /=/, $pair;
if ( defined $value ) {
$value =~ tr/+/ /;
$value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
}
else {
$value = '';
}
$name =~ tr/+/ /;
$name =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
$GET{$name} .= $GET{$name} ? "\0$value" : $value;
}
return %GET;
}
# <LJFUNC>
# name: LJ::decode_url_string
# class: web
# des: Parse URL-style arg/value pairs into a hash.
# args: buffer, hashref
# des-buffer: Scalar or scalarref of buffer to parse.
# des-hashref: Hashref to populate.
# returns: boolean; true.
# </LJFUNC>
sub decode_url_string {
my $a = shift;
my $buffer = ref $a ? $a : \$a;
my $hashref = shift; # output hash
my $keyref = shift; # array of keys as they were found
my $pair;
my @pairs = split( /&/, $$buffer );
@$keyref = @pairs;
my ( $name, $value );
foreach $pair (@pairs) {
( $name, $value ) = split( /=/, $pair );
$value =~ tr/+/ /;
$value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
$name =~ tr/+/ /;
$name =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
$hashref->{$name} .= $hashref->{$name} ? "\0$value" : $value;
}
return 1;
}
# args: hashref of key/values
# arrayref of keys in order (optional)
# returns: urlencoded string
sub encode_url_string {
my ( $hashref, $keyref ) = @_;
return join( '&',
map { LJ::eurl($_) . '=' . LJ::eurl( $hashref->{$_} ) }
( ref $keyref ? @$keyref : keys %$hashref ) );
}
# <LJFUNC>
# name: LJ::eurl
# class: text
# des: Escapes a value before it can be put in a URL. See also [func[LJ::durl]].
# args: string
# des-string: string to be escaped
# returns: string escaped
# </LJFUNC>
sub eurl {
my $a = $_[0];
return '' unless defined $a;
$a =~ s/([^a-zA-Z0-9_\,\-.\/\\\: ])/uc sprintf("%%%02x",ord($1))/eg;
$a =~ tr/ /+/;
return $a;
}
# <LJFUNC>
# name: LJ::durl
# class: text
# des: Decodes a value that's URL-escaped. See also [func[LJ::eurl]].
# args: string
# des-string: string to be decoded
# returns: string decoded
# </LJFUNC>
sub durl {
my $a = $_[0];
return '' unless defined $a;
$a =~ tr/+/ /;
$a =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
return $a;
}
# <LJFUNC>
# name: LJ::exml
# class: text
# des: Escapes a value before it can be put in XML.
# args: string
# des-string: string to be escaped
# returns: string escaped.
# </LJFUNC>
sub exml {
my $a = $_[0];
return '' unless defined $a;
# fast path for the commmon case:
return $a unless $a =~ /[&\"\'<>\x00-\x08\x0B\x0C\x0E-\x1F]/;
# what are those character ranges? XML 1.0 allows:
# #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
$a =~ s/\&/&amp;/g;
$a =~ s/\"/&quot;/g;
$a =~ s/\'/&apos;/g;
$a =~ s/</&lt;/g;
$a =~ s/>/&gt;/g;
$a =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F]//g;
return $a;
}
# <LJFUNC>
# name: LJ::ehtml
# class: text
# des: Escapes a value before it can be put in HTML.
# args: string
# des-string: string to be escaped
# returns: string escaped.
# </LJFUNC>
sub ehtml {
my $a = $_[0];
return '' unless defined $a;
# fast path for the commmon case:
return $a unless $a =~ /[&\"\'<>]/;
# this is faster than doing one substitution with a map:
$a =~ s/\&/&amp;/g;
$a =~ s/\"/&quot;/g;
$a =~ s/\'/&\#39;/g;
$a =~ s/</&lt;/g;
$a =~ s/>/&gt;/g;
return $a;
}
*eall = \&ehtml; # old BML syntax required eall to also escape BML. not anymore.
# <LJFUNC>
# name: LJ::dhtml
# class: text
# des: Decodes a value that's HTML-escaped. See also [func[LJ::ehtml]].
# args: string
# des-string: string to be decoded
# returns: string decoded
# </LJFUNC>
sub dhtml {
my $a = $_[0];
return '' unless defined $a;
return HTML::Entities::decode_entities($a);
}
# <LJFUNC>
# name: LJ::etags
# class: text
# des: Escapes < and > from a string
# args: string
# des-string: string to be escaped
# returns: string escaped.
# </LJFUNC>
sub etags {
my $a = $_[0];
return '' unless defined $a;
# fast path for the commmon case:
return $a unless $a =~ /[<>]/;
$a =~ s/</&lt;/g;
$a =~ s/>/&gt;/g;
return $a;
}
# <LJFUNC>
# name: LJ::ejs
# class: text
# des: Escapes a string value before it can be put in JavaScript.
# args: string
# des-string: string to be escaped
# returns: string escaped.
# </LJFUNC>
sub ejs {
my $a = $_[0];
return '' unless defined $a;
# use zero-width lookahead to insert a backslash where needed
$a =~ s/(?=[\"\'\\])/\\/g;
$a =~ s/&quot;/\\&quot;/g;
$a =~ s/\r?\n/\\n/gs;
$a =~ s/\r//gs;
$a =~ s/\xE2\x80[\xA8\xA9]//gs;
return $a;
}
# given a string, makes it into a string you can put into javascript,
# including protecting against closing </script> tags in the entry.
# does the double quotes for ya.
sub ejs_string {
my $str = ejs( $_[0] );
$str =~ s!</script!</scri\" + \"pt!gi;
return "\"" . $str . "\"";
}
# changes every char in a string to %XX where XX is the hex value
# this is useful for passing strings to javascript through HTML, because
# javascript's "unescape" function expects strings in this format
sub ejs_all {
my $a = $_[0];
return '' unless defined $a;
$a =~ s/(.)/uc sprintf("%%%02x",ord($1))/eg;
return $a;
}
# strip all HTML tags from a string
sub strip_html {
my $str = $_[0];
return '' unless defined $str;
$str =~
s/\<(?:lj(?: site=[^\s]+)? user|user(?: site=[^\s]+)? name)\=['"]?([\w-]+)['"]?[^>]*\>/$1/g;
$str =~ s/\<([^\<])+\>//g;
return $str;
}
# <LJFUNC>
# name: LJ::is_ascii
# des: checks if text is pure ASCII.
# args: text
# des-text: text to check for being pure 7-bit ASCII text.
# returns: 1 if text is indeed pure 7-bit, 0 otherwise.
# </LJFUNC>
sub is_ascii {
my $text = $_[0];
return 1 unless defined $text;
return ( $text !~ m/[^\x01-\x7f]/ );
}
# Logs a warning if text has Perl's internal UTF8 flag set, so we can track it
# down later. This is intended for debugging problems on prod that can't be
# reproduced in dev.
sub warn_for_perl_utf8 {
my $text = $_[0];
if ( Encode::is_utf8($text) ) {
cluck("MOJIBAKE ALERT: Found text with Perl UTF8 flag set!");
}
}
# <LJFUNC>
# name: LJ::is_utf8
# des: check text for UTF-8 validity.
# args: text
# des-text: text to check for UTF-8 validity
# returns: 1 if text is a valid UTF-8 stream, 0 otherwise.
# </LJFUNC>
sub is_utf8 {
my $text = shift;
if ( LJ::Hooks::are_hooks("is_utf8") ) {
return LJ::Hooks::run_hook( "is_utf8", $text );
}
require Unicode::CheckUTF8;
{
no strict;
local $^W = 0;
*stab = *{"main::LJ::"};
undef $stab{is_utf8};
}
*LJ::is_utf8 = \&LJ::is_utf8_wrapper;
return LJ::is_utf8_wrapper($text);
}
# <LJFUNC>
# name: LJ::is_utf8_wrapper
# des: wraps the check for UTF-8 validity.
# args: text
# des-text: text to check for UTF-8 validity
# returns: 1 if text is a valid UTF-8 stream, a reference, or null; 0 otherwise.
# </LJFUNC>
sub is_utf8_wrapper {
my $text = $_[0];
if ( defined $text && !ref $text && $text ) {
# we need to make sure $text values are treated as strings
return Unicode::CheckUTF8::is_utf8( '' . $text );
}
else {
# all possible "false" values for $text are valid unicode
return 1;
}
}
# <LJFUNC>
# name: LJ::has_too_many
# des: checks if text is too long
# args: text, maxbreaks, maxchars
# des-text: text to check if too long
# des-maxbreaks: maximum number of linebreak
# des-maxchars: maximum number of characters
# returns: true if text has more than maxbreaks HTML linebreaks or more than maxchars characters
# </LJFUNC>
sub has_too_many {
my ( $text, %opts ) = @_;
return 1 if exists $opts{chars} && length($text) > $opts{chars};
if ( exists $opts{linebreaks} ) {
# - we always call this on HTML, so ignore literal \n.
# - paragraphs count as two linebreaks.
# - this is ballpark guessing and ignores MANY things that can add
# vertical space (<li>s, blockquotes...) -- shrug!
my @breaks = $text =~ m!</?(?:p|br)[^>]*>!g;
return 1 if scalar @breaks > $opts{linebreaks};
}
return 0;
}
# alternate version of "lc" that handles UTF-8
# args: text string for lowercasing
# returns: lowercase string
sub utf8_lc {
use Encode; # Perl 5.8 or higher
# get the encoded text to work with
my $text = decode( "UTF-8", $_[0] );
# return the lowercased text
return encode( "UTF-8", lc $text );
}
# <LJFUNC>
# name: LJ::text_out
# des: force outgoing text into valid UTF-8.
# args: text
# des-text: reference to text to pass to output. Text if modified in-place.
# returns: nothing.
# </LJFUNC>
sub text_out {
my $rtext = shift;
# is this valid UTF-8 already?
return if LJ::is_utf8($$rtext);
# no. Blot out all non-ASCII chars
$$rtext =~ s/[\x00\x80-\xff]/\?/g;
return;
}
# <LJFUNC>
# name: LJ::text_in
# des: do appropriate checks on input text. Should be called on all
# user-generated text.
# args: text
# des-text: text to check
# returns: 1 if the text is valid, 0 if not.
# </LJFUNC>
sub text_in {
my $text = shift;
if ( ref($text) eq "HASH" ) {
return !( grep { !LJ::is_utf8($_) } values %{$text} );
}
if ( ref($text) eq "ARRAY" ) {
return !( grep { !LJ::is_utf8($_) } @{$text} );
}
return LJ::is_utf8($text);
}
# <LJFUNC>
# name: LJ::text_convert
# des: convert old entries/comments to UTF-8 using user's default encoding.
# args: dbs?, text, u, error
# des-dbs: optional. Deprecated; a master/slave set of database handles.
# des-text: old possibly non-ASCII text to convert
# des-u: user hashref of the journal's owner
# des-error: ref to a scalar variable which is set to 1 on error
# (when user has no default encoding defined, but
# text needs to be translated).
# returns: converted text or undef on error
# </LJFUNC>
sub text_convert {
my ( $text, $u, $error ) = @_;
# maybe it's pure ASCII?
return $text if LJ::is_ascii($text);
# load encoding id->name mapping if it's not loaded yet
LJ::load_codes( { "encoding" => \%LJ::CACHE_ENCODINGS } )
unless %LJ::CACHE_ENCODINGS;
if ( $u->{'oldenc'} == 0
|| not defined $LJ::CACHE_ENCODINGS{ $u->{'oldenc'} } )
{
$$error = 1;
return undef;
}
# convert!
my $name = $LJ::CACHE_ENCODINGS{ $u->{'oldenc'} };
unless ( LJ::ConvUTF8->supported_charset($name) ) {
$$error = 1;
return undef;
}
return LJ::ConvUTF8->to_utf8( $name, $text );
}
# <LJFUNC>
# name: LJ::text_length
# des: returns both byte length and character length of a string.
# The function assumes that its argument is a valid UTF-8 string.
# args: text
# des-text: the string to measure
# returns: a list of two values, (byte_length, char_length).
# </LJFUNC>
sub text_length {
my $text = shift;
my $bl = length($text);
my $cl = 0;
my $utf_char = "([\x00-\x7f]|[\xc0-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...)";
while ( $text =~ m/$utf_char/go ) { $cl++; }
return ( $bl, $cl );
}
# <LJFUNC>
# name: LJ::text_trim
# des: truncate string according to requirements on byte length, char
# length, or both. "char length" means number of UTF-8 characters.
# args: text, byte_max, char_max
# des-text: the string to trim
# des-byte_max: maximum allowed length in bytes; if 0, there's no restriction
# des-char_max: maximum allowed length in chars; if 0, there's no restriction
# returns: the truncated string.
# </LJFUNC>
sub text_trim {
my ( $text, $byte_max, $char_max, $didtrim_ref ) = @_;
$text = defined $text ? LJ::trim($text) : '';
return $text unless $byte_max or $char_max;
my $cur = 0;
my $utf_char = "([\x00-\x7f]|[\xc0-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...)";
# if we don't have a character limit, assume it's the same as the byte limit.
# we will never have more characters than bytes, but we might have more bytes
# than characters, so we can't inherit the other way.
$char_max ||= $byte_max;
my $fake_scalar;
my $ref = ref $didtrim_ref ? $didtrim_ref : \$fake_scalar;
while ( $text =~ m/$utf_char/gco ) {
unless ($char_max) {
$$ref = 1;
last;
}
if ( $byte_max and $cur + length($1) > $byte_max ) {
$$ref = 1;
last;
}
$cur += length($1);
$char_max--;
}
return LJ::trim( substr( $text, 0, $cur ) );
}
# <LJFUNC>
# name: LJ::text_compress
# des: Compresses a chunk of text, to gzip, if configured for site. Can compress
# a scalarref in place, or return a compressed copy. Won't compress if
# value is too small, already compressed, or size would grow by compressing.
# args: text
# des-text: either a scalar or scalarref
# returns: nothing if given a scalarref (to compress in-place), or original/compressed value,
# depending on site config.
# </LJFUNC>
sub text_compress {
my $text = $_[0];
my $ref = ref $text;
die "Invalid reference" if $ref && $ref ne "SCALAR";
my $tref = $ref ? $text : \$text;
my $pre_len = length($$tref);
unless ( substr( $$tref, 0, 2 ) eq "\037\213" || $pre_len < 100 ) {
my $gz = Compress::Zlib::memGzip($$tref);
if ( length($gz) < $pre_len ) {
$$tref = $gz;
}
}
return $ref ? undef : $$tref;
}
# <LJFUNC>
# name: LJ::text_uncompress
# des: Uncompresses a chunk of text, from gzip, if configured for site. Can uncompress
# a scalarref in place, or return a compressed copy. Won't uncompress unless
# it finds the gzip magic number at the beginning of the text.
# args: text
# des-text: either a scalar or scalarref.
# returns: nothing if given a scalarref (to uncompress in-place), or original/uncompressed value,
# depending on if test was compressed or not
# </LJFUNC>
sub text_uncompress {
my $text = $_[0];
my $ref = ref $text;
die "Invalid reference" if $ref && $ref ne "SCALAR";
my $tref = $ref ? $text : \$text;
# check for gzip's magic number
if ( substr( $$tref, 0, 2 ) eq "\037\213" ) {
$$tref = Compress::Zlib::memGunzip($$tref);
}
return $ref ? undef : $$tref;
}
# function to trim a string containing HTML. this will auto-close any
# html tags that were still open when the string was truncated
sub html_trim {
my ( $text, $char_max, $truncated ) = @_;
return $text unless $char_max;
my $p = HTML::TokeParser->new( \$text );
my @open_tags; # keep track of what tags are open
my $out = '';
my $content_len = 0;
TOKEN:
while ( my $token = $p->get_token ) {
my $type = $token->[0];
my $tag = $token->[1];
my $attr = $token->[2]; # hashref
if ( $type eq "S" ) {
my $selfclose;
# start tag
$out .= "<$tag";
# assume tags are properly self-closed
$selfclose = 1 if lc $tag eq 'input' || lc $tag eq 'br' || lc $tag eq 'img';
# preserve order of attributes. the original order is
# in element 4 of $token
foreach my $attrname ( @{ $token->[3] } ) {
if ( $attrname eq '/' ) {
$selfclose = 1;
next;
}
# FIXME: neaten
$attr->{$attrname} = LJ::no_utf8_flag( $attr->{$attrname} );
$out .= " $attrname=\"" . LJ::ehtml( $attr->{$attrname} ) . "\"";
}
$out .= $selfclose ? " />" : ">";
push @open_tags, $tag unless $selfclose;
}
elsif ( $type eq 'T' || $type eq 'D' ) {
my $content = $token->[1];
if ( length($content) + $content_len > $char_max ) {
# truncate and stop parsing
$content = LJ::text_trim( $content, undef, ( $char_max - $content_len ) );
$out .= $content;
$$truncated = 1 if ref $truncated;
last;
}
$content_len += length $content;
$out .= $content;
}
elsif ( $type eq 'C' ) {
# comment, don't care
$out .= $token->[1];
}
elsif ( $type eq 'E' ) {
# end tag
if ( $open_tags[-1] eq $tag ) {
pop @open_tags;
$out .= "</$tag>";
}
}
}
$out .= join( "\n", map { "</$_>" } reverse @open_tags );
return $out;
}
# takes a number, inserts commas where needed
sub commafy {
my $number = $_[0];
return '' unless defined $number;
return $number unless $number =~ /^\d+$/;
my $punc = LJ::Lang::ml('number.punctuation') || ",";
$number =~ s/(?<=\d)(?=(\d\d\d)+(?!\d))/$punc/g;
return $number;
}
# <LJFUNC>
# name: LJ::html_newlines
# des: Replace newlines with HTML break tags.
# args: text
# returns: text, possibly including HTML break tags.
# </LJFUNC>
sub html_newlines {
my $text = $_[0];
return '' unless defined $text;
$text =~ s/\n/<br \/>/gm;
return $text;
}
# prepend ">" to each line of text to make a blockquote in markdown
# for when text has multiple lines and prepending ">" to the entire
# text will just convert the first line / paragraph
sub markdown_blockquote {
my $text = $_[0];
return '' unless defined $text;
$text =~ s/(^.*)/\> $1/gm;
return $text;
}
1;