289 lines
13 KiB
Ruby
289 lines
13 KiB
Ruby
# Use css parser to break up style blocks
|
|
require "css_parser"
|
|
|
|
module CssCleaner
|
|
include CssParser
|
|
|
|
# Constant regexps for css values. Each constant is a building block that is
# embedded (via interpolation or Regexp#to_s) in the larger patterns below it.
ALPHA_REGEX = Regexp.new('[a-z\-]+')
UNITS_REGEX = Regexp.new('deg|cm|em|ex|in|mm|pc|pt|px|s|%', Regexp::IGNORECASE)
NUMBER_REGEX = Regexp.new('-?\.?\d{1,3}\.?\d{0,3}')
# A number, an optional unit, and an optional trailing comma, with optional
# whitespace around each part. The backslashes are doubled because this is a
# double-quoted string, where a bare "\s" is the escape for a literal space
# rather than the regexp whitespace class.
NUMBER_WITH_UNIT_REGEX = Regexp.new("#{NUMBER_REGEX}\\s*#{UNITS_REGEX}?\\s*,?\\s*")
# One or more numbers (with optional units) wrapped in parentheses.
PAREN_NUMBER_REGEX = Regexp.new('\(\s*' + NUMBER_WITH_UNIT_REGEX.to_s + '+\s*\)')
# Vendor prefixes (without the surrounding dashes).
PREFIX_REGEX = Regexp.new('moz|ms|o|webkit')

FUNCTION_NAME_REGEX = Regexp.new('scalex?y?|translatex?y?|skewx?y?|rotatex?y?|matrix', Regexp::IGNORECASE)
TRANSFORM_FUNCTION_REGEX = Regexp.new("#{FUNCTION_NAME_REGEX}#{PAREN_NUMBER_REGEX}")

SHAPE_NAME_REGEX = Regexp.new('rect', Regexp::IGNORECASE)
SHAPE_FUNCTION_REGEX = Regexp.new("#{SHAPE_NAME_REGEX}#{PAREN_NUMBER_REGEX}")

RGBA_REGEX = Regexp.new("rgba?" + PAREN_NUMBER_REGEX.to_s, Regexp::IGNORECASE)
HSLA_REGEX = Regexp.new("hsla?" + PAREN_NUMBER_REGEX.to_s, Regexp::IGNORECASE)
# Hex colors, alphabetic (named) colors, rgb()/rgba(), or hsl()/hsla().
COLOR_REGEX = Regexp.new("#[0-9a-f]{3,6}|" + ALPHA_REGEX.to_s + "|" + RGBA_REGEX.to_s + "|" + HSLA_REGEX.to_s)
COLOR_STOP_FUNCTION_REGEX = Regexp.new('color-stop\s*\(' + NUMBER_WITH_UNIT_REGEX.to_s + '\s*\,?\s*' + COLOR_REGEX.to_s + '\s*\)', Regexp::IGNORECASE)

# list of filter functions can be found at https://developer.mozilla.org/en-US/docs/Web/CSS/filter#syntax
FILTER_NAME_REGEX = Regexp.new("blur|brightness|contrast|grayscale|hue-rotate|invert|opacity|saturate|sepia", Regexp::IGNORECASE)
FILTER_FUNCTION_REGEX = Regexp.new("#{FILTER_NAME_REGEX}#{PAREN_NUMBER_REGEX}")

# drop-shadow can take multiple values, which are a mix of numbers and colors
DROP_SHADOW_NAME_REGEX = Regexp.new("drop-shadow", Regexp::IGNORECASE)
DROP_SHADOW_VALUE_REGEX = Regexp.new("\\(\\s*(#{NUMBER_WITH_UNIT_REGEX}|#{COLOR_REGEX}\\s*)+\\s*\\)")
DROP_SHADOW_FUNCTION_REGEX = Regexp.new("#{DROP_SHADOW_NAME_REGEX}#{DROP_SHADOW_VALUE_REGEX}")

# Custom properties (variables) are declared using --name: value and accessed
# using property: var(--name). The var() function can be more complex, e.g.,
# var(--name, fallback value), but we're keeping our implementation simple.
CUSTOM_PROPERTY_NAME_REGEXP = Regexp.new("\\-\\-[0-9a-z\\-_]+", Regexp::IGNORECASE)
PAREN_CUSTOM_PROPERTY_REGEX = Regexp.new("\\(\\s*#{CUSTOM_PROPERTY_NAME_REGEXP}\\s*\\)", Regexp::IGNORECASE)
VAR_FUNCTION_REGEX = Regexp.new("var#{PAREN_CUSTOM_PROPERTY_REGEX}", Regexp::IGNORECASE)
|
|
|
|
# To allow the url() function, it is also necessary to include "url" in ArchiveConfig.SUPPORTED_CSS_KEYWORDS.
#
# Top-level domains, from the ICANN list at http://www.icann.org/en/registries/top-level-domains.htm
# Frozen so the list cannot be modified after the regexps below have been built from it.
TOP_LEVEL_DOMAINS = %w[ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl sm sn so sr st su sv sy sz tc td tel tf tg th tj tk tl tm tn to tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws xn xxx ye yt za zm zw].freeze
# An http(s) host name ending in one of the top-level domains above.
DOMAIN_REGEX = Regexp.new('https?://\w[\w\-\.]+\.(' + TOP_LEVEL_DOMAINS.join('|') + ')')
# Either such a host, or the site-relative /images path.
DOMAIN_OR_IMAGES_REGEX = Regexp.new('\/images|' + DOMAIN_REGEX.to_s)
# A path under one of those roots whose file extension is listed in ArchiveConfig.SUPPORTED_EXTERNAL_URLS.
URI_REGEX = Regexp.new(DOMAIN_OR_IMAGES_REGEX.to_s + '/[\w\-\.\/]*[\w\-]\.(' + ArchiveConfig.SUPPORTED_EXTERNAL_URLS.join('|') + ')')
# The URI bare, double-quoted, or single-quoted.
URL_REGEX = Regexp.new(URI_REGEX.to_s + '|"' + URI_REGEX.to_s + '"|\'' + URI_REGEX.to_s + '\'')
URL_FUNCTION_REGEX = Regexp.new('url\(\s*' + URL_REGEX.to_s + '\s*\)')

# Any single value accepted by sanitize_css_value.
VALUE_REGEX = Regexp.new("#{TRANSFORM_FUNCTION_REGEX}|#{URL_FUNCTION_REGEX}|#{COLOR_STOP_FUNCTION_REGEX}|#{COLOR_REGEX}|#{NUMBER_WITH_UNIT_REGEX}|#{ALPHA_REGEX}|#{SHAPE_FUNCTION_REGEX}|#{FILTER_FUNCTION_REGEX}|#{DROP_SHADOW_FUNCTION_REGEX}|#{VAR_FUNCTION_REGEX}")
|
|
|
|
|
|
# For use in ActiveRecord models: every problem found is reported through
# errors.add(:base, ...).
# We parse and clean the CSS rule by rule in order to provide more helpful error messages.
#
# css_code - the raw CSS text. nil, or text without any word character,
#            yields "" without touching errors.
# options  - :prefix       if you want to make sure a particular prefix appears on
#                          all the selectors in this block of css, eg ".userstuff p"
#                          instead of just "p". Selectors that already start with
#                          the prefix are left alone.
#            :caller_check optional callable invoked as call(rule_set, property, value)
#                          for each declaration that passed sanitization; the
#                          declaration is only kept when it returns a truthy value.
#
# Returns the cleaned CSS as a String.
def clean_css_code(css_code, options = {})
  return "" unless css_code.to_s.match?(/\w/) # nil, or only spaces of various kinds

  parser = CssParser::Parser.new
  parser.add_block!(css_code)

  prefix = options[:prefix] || ""
  caller_check = options[:caller_check]

  errors.add(:base, :no_valid_css) if parser.to_s.blank?

  clean_css = +""
  parser.each_rule_set do |rs|
    # Do not internationalize the , used as a join in these errors -- it's reflective of the comma
    # used in the list of selectors, which does not change based on locale.
    selector_list = rs.selectors.join(", ")

    selectors = rs.selectors.map do |selector|
      if selector.match?(/@font-face/i)
        errors.add(:base, :font_face)
        next
      end
      # remove newlines and convert &gt; entities back to the > direct child selector
      sel = selector.delete("\n").gsub("&gt;", ">").strip
      prefix.blank? || sel.start_with?(prefix) ? sel : "#{prefix} #{sel}"
    end
    # rejected selectors left a nil behind; drop them so they are never written out
    selectors = selectors.compact

    clean_declarations = +""
    rs.each_declaration do |property, value, is_important|
      if property.blank? || value.blank?
        errors.add(:base, :no_valid_css_for_selectors, selectors: selector_list)
      elsif sanitize_css_property(property).blank?
        # If it starts with --, assume the user was trying to define a custom property.
        if property.start_with?("--")
          errors.add(:base, :invalid_custom_property_name, property: property, selectors: selector_list)
        else
          errors.add(:base, :banned_property, property: property)
        end
      elsif (cleanval = sanitize_css_declaration_value(property, value)).blank?
        errors.add(:base, :banned_value_for_property, property: property, selectors: selector_list, value: value)
      elsif !caller_check || caller_check.call(rs, property, value)
        clean_declarations << " #{property}: #{cleanval}#{is_important ? ' !important' : ''};\n"
      end
    end

    if clean_declarations.blank?
      errors.add(:base, :no_rules_for_selectors, selectors: selector_list)
    elsif selectors.any?
      # everything looks ok, add it to the css (a rule whose selectors were all
      # rejected is skipped, since it would be written with an empty selector)
      clean_css << "#{selectors.join(",\n")} {\n" << clean_declarations << "}\n\n"
    end
  end

  clean_css
end
|
|
|
|
# Truthy when the property is listed in ArchiveConfig.SUPPORTED_CSS_PROPERTIES,
# or when it contains a supported property preceded by a vendor prefix, e.g.
# "-moz-" followed by a supported name.
# Returns true, a MatchData, or nil; callers should rely on truthiness only.
def legal_property?(property)
  supported = ArchiveConfig.SUPPORTED_CSS_PROPERTIES
  return true if supported.include?(property)

  vendor_prefixed = /-(#{PREFIX_REGEX})-(#{supported.join('|')})/
  property.match(vendor_prefixed)
end
|
|
|
|
# True when the property name contains one of the names listed in
# ArchiveConfig.SUPPORTED_CSS_SHORTHAND_PROPERTIES (the match is deliberately
# unanchored, so a name that merely contains a shorthand name also passes).
# Returns true or false.
def legal_shorthand_property?(property)
  shorthands = ArchiveConfig.SUPPORTED_CSS_SHORTHAND_PROPERTIES
  # An empty list would interpolate into the empty pattern, which matches
  # every string and would therefore allow every property.
  return false if shorthands.empty?

  property.match?(/#{shorthands.join('|')}/)
end
|
|
|
|
# Truthy when the entire property name is a custom property name, i.e. it
# matches CUSTOM_PROPERTY_NAME_REGEXP from start to end.
# Returns a MatchData or nil.
def custom_property?(property)
  whole_name = /\A(#{CUSTOM_PROPERTY_NAME_REGEXP})\z/
  property.match(whole_name)
end
|
|
|
|
# Returns the property unchanged when it is a legal property, a legal
# shorthand property, or a custom property; returns nil otherwise.
def sanitize_css_property(property)
  checks = %i[legal_property? legal_shorthand_property? custom_property?]
  # any? stops at the first check that passes, like the chained || it replaces
  property if checks.any? { |check| send(check, property) }
end
|
|
|
|
# A declaration must match the format `property: value;` (space and semicolon
# are optional in user input).
# All properties must appear in ArchiveConfig.SUPPORTED_CSS_PROPERTIES or
# ArchiveConfig.SUPPORTED_CSS_SHORTHAND_PROPERTIES, or that property and its
# value will be removed and an error message will be given.
# All values are sanitized. If any values in a declaration are invalid, the
# value will be blanked out and an empty string returned, which will result
# in an error.
#
# Returns the sanitized value with surrounding whitespace stripped, or "".
def sanitize_css_declaration_value(property, value)
  clean =
    case property
    when "font-family"
      # hand back the original value so its capitalization is preserved
      sanitize_css_font(value).present? ? value : ""
    when "content"
      # the var() function is not allowed here
      value.match?(/\bvar\b/i) ? "" : sanitize_css_content(value)
    else
      # The url() function can be used in the values for certain properties,
      # provided "url" is included in ArchiveConfig.SUPPORTED_CSS_KEYWORDS. If
      # those criteria are not met, we strip the value here. If they are met, the
      # value will undergo sanitization in tokenize_and_sanitize_css_value or
      # sanitize_css_value.
      url_properties = %w[background background-image border border-image list-style list-style-image]
      if value.match?(/\burl\b/i) && !(ArchiveConfig.SUPPORTED_CSS_KEYWORDS.include?("url") && url_properties.include?(property))
        ""
      elsif legal_shorthand_property?(property) || custom_property?(property)
        tokenize_and_sanitize_css_value(value)
      elsif legal_property?(property)
        sanitize_css_value(value)
      else
        ""
      end
    end

  clean.strip
end
|
|
|
|
# Divide a css value into tokens and clean them individually.
# Tokens end at whitespace, a comma, or an opening parenthesis; a token that
# opens a parenthesis extends to the matching closing parenthesis.
# Returns the reassembled value, or "" when parentheses are mismatched or any
# token is rejected by sanitize_css_token.
def tokenize_and_sanitize_css_value(value)
  delimiter = /\s+|,|\(/
  scanner = StringScanner.new(value)
  result = +""

  # keep going while there is still a delimiter ahead of us
  while scanner.exist?(delimiter)
    piece = scanner.scan_until(delimiter)

    # whitespace or a lone comma between tokens is copied through as-is
    if piece.blank? || piece == ","
      result << piece
      next
    end

    # the piece stopped at "(": read on until every parenthesis is closed
    depth = piece.match?(/\($/) ? 1 : 0
    until depth.zero?
      chunk = scanner.scan_until(/\(|\)/)
      return "" if chunk.nil? # mismatched parens

      piece += chunk
      depth += 1 if piece.match?(/\($/)
      depth -= 1 if piece.match?(/\)$/)
    end

    # we now have a single token; remember the separator that ended it
    trailing = piece[/(\s|,)$/].to_s
    piece.strip!
    piece.chomp!(",")
    cleaned = sanitize_css_token(piece)
    return "" if cleaned.blank?

    result << cleaned << trailing
  end

  # whatever follows the last delimiter is one final token
  leftover = scanner.rest
  unless leftover.blank?
    cleaned = sanitize_css_token(leftover)
    return "" if cleaned.blank?

    result << cleaned
  end

  result
end
|
|
|
|
# Clean a single token: anything containing "gradient" goes through the
# gradient sanitizer, everything else through the plain value sanitizer.
def sanitize_css_token(token)
  return sanitize_css_gradient(token) if token.match?(/gradient/)

  sanitize_css_value(token)
end
|
|
|
|
# Sanitize a CSS gradient, e.g.
# background:-webkit-gradient( linear, left bottom, left top, color-stop(0, rgb(82,82,82)), color-stop(1, rgb(125,124,125)));
# -moz-linear-gradient(bottom, rgba(120,120,120,1) 5%, rgba(94,94,94,1) 50%, rgba(108,108,108,1) 55%, rgba(137,137,137,1) 100%);
#
# The value is split into a function name and the text between its outermost
# parentheses; that interior is cleaned with tokenize_and_sanitize_css_value.
# Returns "name(cleaned interior)", or "" when the value is not of that form,
# the name does not contain "gradient", or the interior does not survive cleaning.
def sanitize_css_gradient(value)
  parts = value.match(/^([a-z\-]+)\((.*)\)/)
  return "" unless parts

  function = parts[1]
  cleaned_interior = tokenize_and_sanitize_css_value(parts[2])
  return "" unless function.match?(/gradient/)
  return "" if cleaned_interior.blank?

  "#{function}(#{cleaned_interior})"
end
|
|
|
|
# All values must be either
# - in ArchiveConfig.SUPPORTED_CSS_KEYWORDS
# - URLs of the format url(http://url/)
# - rgba(), hsla(), hex, or named colors
# - numeric values
# - transform, shape, filter, drop shadow, or variable functions
# Comma-separated lists of these values are also allowed.
#
# Returns the original value (with any var() functions downcased) when it is
# acceptable, or "" when it is not.
def sanitize_css_value(value)
  value_stripped = strip_value(value)

  # If it's a comma-separated set of valid values, it's fine. However, we need
  # to downcase any var() functions to match the css_parser gem's downcasing
  # of property names.
  # \A and \z anchor the whole string. ^ and $ anchor each line, which would
  # accept a multi-line value as soon as any one of its lines is valid.
  if value_stripped.match?(/\A(#{VALUE_REGEX},?\s*)+\z/i)
    return value unless value.match?(VAR_FUNCTION_REGEX)

    return value.gsub(VAR_FUNCTION_REGEX, &:downcase)
  end

  # If the value is explicitly in our list of supported keywords, it's fine.
  # However, note that !important is always allowed (refer to the comments on
  # strip_value(value) and ArchiveConfig.SUPPORTED_CSS_KEYWORDS for more), and
  # that the url() function is allowed by the VALUE_REGEX above. Excluding
  # url() from SUPPORTED_CSS_KEYWORDS only strips it because of the check in
  # sanitize_css_declaration_value.
  return value if value_stripped.split(",").all? { |subval| ArchiveConfig.SUPPORTED_CSS_KEYWORDS.include?(subval.strip) }

  ""
end
|
|
|
|
# Sanitize the value of the content property.
# Returns the value unchanged when it is acceptable, or "" when it is not.
# The patterns are anchored with \A and \z so the entire value has to match;
# ^ and $ anchor each line and would let a multi-line value through as soon
# as one of its lines is valid.
def sanitize_css_content(value)
  # For now we only allow a single completely quoted string
  return value if value.match?(/\A'[^']*'\z/)
  return value if value.match?(/\A"[^"]*"\z/)

  # or a valid img url
  return value if value.match?(/\A#{URL_FUNCTION_REGEX}\z/)

  # or "none"
  return value if value == "none"

  ""
end
|
|
|
|
# Font family names may be alphanumeric values with dashes and spaces,
# optionally wrapped in single or double quotes, in a comma-separated list.
# Returns the original value (capitalization intact) when every name in the
# list is acceptable, or "" otherwise -- including when the list is empty.
def sanitize_css_font(value)
  font_names = strip_value(value).split(",")
  # all? is vacuously true for an empty list, so require at least one name
  return "" if font_names.empty?

  # \A and \z make the whole name match; ^ and $ would accept a multi-line
  # name as soon as one of its lines is valid.
  valid = font_names.all? { |fontname| fontname.strip.match?(/\A('?[a-z0-9\- ]+'?|"?[a-z0-9\- ]+"?)\z/) }
  valid ? value : ""
end
|
|
|
|
# Downcase the value, remove every "!important", and strip surrounding
# whitespace, to simplify sanitization.
# In most cases, we return the original value after sanitization, which
# restores the !important keyword.
# Note that this means !important is always allowed, regardless of whether it
# is included in ArchiveConfig.SUPPORTED_CSS_KEYWORDS.
def strip_value(value)
  lowered = value.downcase
  lowered.gsub("!important", "").strip
end
|
|
end
|