# Use css parser to break up style blocks
require "css_parser"
require "strscan"

# Mixin that sanitizes user-supplied CSS against whitelists of properties,
# keywords and value formats. Validation problems are reported through
# `errors.add`, so the including class must provide an `errors` object
# (see "For use in ActiveRecord models" below).
module CssCleaner
  include CssParser

  # constant regexps for css values
  ALPHA_REGEX = Regexp.new('[a-z\-]+')
  UNITS_REGEX = Regexp.new('deg|cm|em|ex|in|mm|pc|pt|px|s|%', Regexp::IGNORECASE)
  NUMBER_REGEX = Regexp.new('-?\.?\d{1,3}\.?\d{0,3}')
  # NOTE(review): inside this double-quoted literal Ruby turns "\s" into a
  # literal space, so only spaces (not tabs/newlines) are accepted around the
  # unit and the comma. Left untouched on purpose because changing it would
  # change which values are accepted.
  NUMBER_WITH_UNIT_REGEX = Regexp.new("#{NUMBER_REGEX}\s*#{UNITS_REGEX}?\s*,?\s*")
  PAREN_NUMBER_REGEX = Regexp.new('\(\s*' + NUMBER_WITH_UNIT_REGEX.to_s + '+\s*\)')
  PREFIX_REGEX = Regexp.new('moz|ms|o|webkit')
  FUNCTION_NAME_REGEX = Regexp.new('scalex?y?|translatex?y?|skewx?y?|rotatex?y?|matrix', Regexp::IGNORECASE)
  TRANSFORM_FUNCTION_REGEX = Regexp.new("#{FUNCTION_NAME_REGEX}#{PAREN_NUMBER_REGEX}")
  SHAPE_NAME_REGEX = Regexp.new('rect', Regexp::IGNORECASE)
  SHAPE_FUNCTION_REGEX = Regexp.new("#{SHAPE_NAME_REGEX}#{PAREN_NUMBER_REGEX}")
  RGBA_REGEX = Regexp.new("rgba?" + PAREN_NUMBER_REGEX.to_s, Regexp::IGNORECASE)
  HSLA_REGEX = Regexp.new("hsla?" + PAREN_NUMBER_REGEX.to_s, Regexp::IGNORECASE)
  COLOR_REGEX = Regexp.new("#[0-9a-f]{3,6}|" + ALPHA_REGEX.to_s + "|" + RGBA_REGEX.to_s + "|" + HSLA_REGEX.to_s)
  COLOR_STOP_FUNCTION_REGEX = Regexp.new('color-stop\s*\(' + NUMBER_WITH_UNIT_REGEX.to_s + '\s*\,?\s*' + COLOR_REGEX.to_s + '\s*\)', Regexp::IGNORECASE)

  # list of filter functions can be found at https://developer.mozilla.org/en-US/docs/Web/CSS/filter#syntax
  FILTER_NAME_REGEX = Regexp.new("blur|brightness|contrast|grayscale|hue-rotate|invert|opacity|saturate|sepia", Regexp::IGNORECASE)
  FILTER_FUNCTION_REGEX = Regexp.new("#{FILTER_NAME_REGEX}#{PAREN_NUMBER_REGEX}")

  # drop-shadow can take multiple values, which are a mix of numbers and colors
  DROP_SHADOW_NAME_REGEX = Regexp.new("drop-shadow", Regexp::IGNORECASE)
  DROP_SHADOW_VALUE_REGEX = Regexp.new("\\(\\s*(#{NUMBER_WITH_UNIT_REGEX}|#{COLOR_REGEX}\\s*)+\\s*\\)")
  DROP_SHADOW_FUNCTION_REGEX = Regexp.new("#{DROP_SHADOW_NAME_REGEX}#{DROP_SHADOW_VALUE_REGEX}")

  # Custom properties (variables) are declared using --name: value and accessed
  # using property: var(--name). The var() function can be more complex, e.g.,
  # var(--name, fallback value), but we're keeping our implementation simple.
  CUSTOM_PROPERTY_NAME_REGEXP = Regexp.new("\\-\\-[0-9a-z\\-_]+", Regexp::IGNORECASE)
  PAREN_CUSTOM_PROPERTY_REGEX = Regexp.new("\\(\\s*#{CUSTOM_PROPERTY_NAME_REGEXP}\\s*\\)", Regexp::IGNORECASE)
  VAR_FUNCTION_REGEX = Regexp.new("var#{PAREN_CUSTOM_PROPERTY_REGEX}", Regexp::IGNORECASE)

  # To allow the url() function, it is also necessary to include "url" in ArchiveConfig.SUPPORTED_CSS_KEYWORDS
  # from the ICANN list at http://www.icann.org/en/registries/top-level-domains.htm
  TOP_LEVEL_DOMAINS = %w(ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl sm sn so sr st su sv sy sz tc td tel tf tg th tj tk tl tm tn to tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws xn xxx ye yt za zm zw)
  DOMAIN_REGEX = Regexp.new('https?://\w[\w\-\.]+\.(' + TOP_LEVEL_DOMAINS.join('|') + ')')
  DOMAIN_OR_IMAGES_REGEX = Regexp.new('\/images|' + DOMAIN_REGEX.to_s)
  URI_REGEX = Regexp.new(DOMAIN_OR_IMAGES_REGEX.to_s + '/[\w\-\.\/]*[\w\-]\.(' + ArchiveConfig.SUPPORTED_EXTERNAL_URLS.join('|') + ')')
  URL_REGEX = Regexp.new(URI_REGEX.to_s + '|"' + URI_REGEX.to_s + '"|\'' + URI_REGEX.to_s + '\'')
  URL_FUNCTION_REGEX = Regexp.new('url\(\s*' + URL_REGEX.to_s + '\s*\)')

  VALUE_REGEX = Regexp.new("#{TRANSFORM_FUNCTION_REGEX}|#{URL_FUNCTION_REGEX}|#{COLOR_STOP_FUNCTION_REGEX}|#{COLOR_REGEX}|#{NUMBER_WITH_UNIT_REGEX}|#{ALPHA_REGEX}|#{SHAPE_FUNCTION_REGEX}|#{FILTER_FUNCTION_REGEX}|#{DROP_SHADOW_FUNCTION_REGEX}|#{VAR_FUNCTION_REGEX}")

  # A whole value made only of (optionally comma-separated) VALUE_REGEX matches.
  # Built once here instead of on every sanitize_css_value call, and anchored
  # with \A/\z so the *entire* string has to match: ^/$ match per line and
  # would accept a multi-line value as soon as one of its lines is valid.
  VALUE_LIST_REGEX = Regexp.new("\\A(#{VALUE_REGEX},?\\s*)+\\z", Regexp::IGNORECASE)

  # For use in ActiveRecord models
  # We parse and clean the CSS line by line in order to provide more helpful error messages.
  # The prefix is used if you want to make sure a particular prefix appears on all the selectors in
  # this block of css, eg ".userstuff p" instead of just "p"
  #
  # @param css_code [String, nil] raw CSS; nil is treated like blank input
  # @param options [Hash] :prefix is prepended to every selector that does
  #   not already start with it; :caller_check, when given, is called with
  #   (rule_set, property, value) and a falsy result drops that declaration
  # @return [String] the cleaned CSS ("" when the input has no word characters)
  def clean_css_code(css_code, options = {})
    return "" unless css_code.to_s.match?(/\w/) # nil, or only spaces of various kinds

    clean_css = ""
    parser = CssParser::Parser.new
    parser.add_block!(css_code)

    prefix = options[:prefix] || ""
    caller_check = options[:caller_check]

    errors.add(:base, :no_valid_css) if parser.to_s.blank?

    parser.each_rule_set do |rs|
      prefixed = rs.selectors.map do |selector|
        if selector.match?(/@font-face/i)
          errors.add(:base, :font_face)
          next
        end

        # remove whitespace and convert &gt; entities back to the > direct child selector
        sel = selector.delete("\n").gsub("&gt;", ">").strip
        prefix.blank? || sel.start_with?(prefix) ? sel : "#{prefix} #{sel}"
      end
      # `next` above leaves nil for rejected selectors; drop those so they do
      # not show up as empty entries in the joined selector list.
      selectors = prefixed.compact

      clean_declarations = ""
      # Do not internationalize the , used as a join in these errors -- it's reflective of the comma used in the list of selectors, which does not change based on locale.
      original_selectors = rs.selectors.join(", ")
      rs.each_declaration do |property, value, is_important|
        if property.blank? || value.blank?
          errors.add(:base, :no_valid_css_for_selectors, selectors: original_selectors)
        elsif sanitize_css_property(property).blank?
          # If it starts with --, assume the user was trying to define a custom property.
          if property.start_with?("--")
            errors.add(:base, :invalid_custom_property_name, property: property, selectors: original_selectors)
          else
            errors.add(:base, :banned_property, property: property)
          end
        elsif (cleanval = sanitize_css_declaration_value(property, value)).blank?
          errors.add(:base, :banned_value_for_property, property: property, selectors: original_selectors, value: value)
        elsif !caller_check || caller_check.call(rs, property, value)
          clean_declarations += " #{property}: #{cleanval}#{is_important ? ' !important' : ''};\n"
        end
      end

      if clean_declarations.blank?
        errors.add(:base, :no_rules_for_selectors, selectors: original_selectors)
      else
        # everything looks ok, add it to the css
        clean_css += "#{selectors.join(",\n")} {\n"
        clean_css += clean_declarations
        clean_css += "}\n\n"
      end
    end

    clean_css
  end

  # True-ish when the property is listed in
  # ArchiveConfig.SUPPORTED_CSS_PROPERTIES, either directly or behind a
  # vendor prefix from PREFIX_REGEX.
  def legal_property?(property)
    ArchiveConfig.SUPPORTED_CSS_PROPERTIES.include?(property) ||
      property.match(/-(#{PREFIX_REGEX})-(#{ArchiveConfig.SUPPORTED_CSS_PROPERTIES.join('|')})/)
  end

  # True-ish when the property contains one of
  # ArchiveConfig.SUPPORTED_CSS_SHORTHAND_PROPERTIES (the match is unanchored).
  def legal_shorthand_property?(property)
    property.match(/#{ArchiveConfig.SUPPORTED_CSS_SHORTHAND_PROPERTIES.join('|')}/)
  end

  # True-ish when the whole property is a custom property name (--name).
  def custom_property?(property)
    property.match(/\A(#{CUSTOM_PROPERTY_NAME_REGEXP})\z/)
  end

  # Returns the property unchanged when it is allowed, nil otherwise.
  def sanitize_css_property(property)
    return property if legal_property?(property) || legal_shorthand_property?(property) || custom_property?(property)
  end

  # A declaration must match the format `property: value;` (space and semicolon
  # are optional in user input).
  # All properties must appear in ArchiveConfig.SUPPORTED_CSS_PROPERTIES or
  # ArchiveConfig.SUPPORTED_CSS_SHORTHAND_PROPERTIES, or that property and its
  # value will be removed and an error message will be given.
  # All values are sanitized. If any values in a declaration are invalid, the
  # value will be blanked out and an empty property returned, which will result
  # in an error.
  #
  # @return [String] the stripped clean value, or "" when it is not allowed
  def sanitize_css_declaration_value(property, value)
    clean = ""
    if property == "font-family"
      # preserve the original capitalization
      clean = value if sanitize_css_font(value).present?
    elsif property == "content"
      # don't allow var() function
      clean = value.match?(/\bvar\b/i) ? "" : sanitize_css_content(value)
    # The url() function can be used in the values for certain properties,
    # provided "url" is included in ArchiveConfig.SUPPORTED_CSS_KEYWORDS. If
    # those criteria are not met, we strip the value here. If they are met, the
    # value will undergo sanitization in tokenize_and_sanitize_css_value or
    # sanitize_css_value.
    elsif value.match?(/\burl\b/i) && (ArchiveConfig.SUPPORTED_CSS_KEYWORDS.exclude?("url") || %w[background background-image border border-image list-style list-style-image].exclude?(property))
      clean = ""
    elsif legal_shorthand_property?(property) || custom_property?(property)
      clean = tokenize_and_sanitize_css_value(value)
    elsif legal_property?(property)
      clean = sanitize_css_value(value)
    end
    clean.strip
  end

  # divide a css value into tokens and clean them individually
  #
  # Tokens are separated by whitespace or commas; a token that opens a
  # parenthesis runs until the matching closing parenthesis.
  #
  # @return [String] the re-assembled value, or "" if any token is rejected
  #   or the parentheses do not balance
  def tokenize_and_sanitize_css_value(value)
    cleanval = ""
    scanner = StringScanner.new(value)

    # we scan until we find either a space, a comma, or an open parenthesis
    while scanner.exist?(/\s+|,|\(/)
      # we have some tokens left to break up
      token = scanner.scan_until(/\s+|,|\(/)
      if token.blank? || token == ","
        cleanval += token
        next
      end

      # Track nesting by looking at the real last character of each scanned
      # chunk. A `$`-anchored regex on the accumulated token also matches a
      # "(" or ")" that merely precedes a newline, which miscounts the depth
      # for values that break lines inside parentheses.
      in_paren = token.end_with?("(") ? 1 : 0
      while in_paren.positive?
        # scan until closing paren or another opening paren
        nextpart = scanner.scan_until(/\(|\)/)
        return "" unless nextpart # mismatched parens

        token += nextpart
        in_paren += 1 if nextpart.end_with?("(")
        in_paren -= 1 if nextpart.end_with?(")")
      end

      # we now have a single token; \z (not $) so only a separator at the very
      # end of the token counts, never one sitting before an inner newline
      separator = token.match(/(\s|,)\z/) || ""
      token.strip!
      token.chomp!(",")
      cleantoken = sanitize_css_token(token)
      return "" if cleantoken.blank?

      cleanval += cleantoken + separator.to_s
    end

    # whatever is left after the last separator is the final token
    token = scanner.rest
    if token && !token.blank?
      cleantoken = sanitize_css_token(token)
      return "" if cleantoken.blank?

      cleanval += cleantoken
    end

    cleanval
  end

  # Dispatches one token to the gradient sanitizer when it mentions
  # "gradient", and to the generic value sanitizer otherwise.
  def sanitize_css_token(token)
    if token.match?(/gradient/)
      sanitize_css_gradient(token)
    else
      sanitize_css_value(token)
    end
  end

  # sanitize a CSS gradient
  # background:-webkit-gradient( linear, left bottom, left top, color-stop(0, rgb(82,82,82)), color-stop(1, rgb(125,124,125)));
  # -moz-linear-gradient(bottom, rgba(120,120,120,1) 5%, rgba(94,94,94,1) 50%, rgba(108,108,108,1) 55%, rgba(137,137,137,1) 100%);
  #
  # @return [String] "function(cleaned interior)", or "" when the value is
  #   not a gradient function call or its interior does not sanitize
  def sanitize_css_gradient(value)
    call = value.match(/^([a-z\-]+)\((.*)\)/)
    return "" unless call

    function = call[1]
    cleaned_interior = tokenize_and_sanitize_css_value(call[2])
    return "" unless function.match?(/gradient/) && !cleaned_interior.blank?

    "#{function}(#{cleaned_interior})"
  end

  # All values must be either
  # - in ArchiveConfig.SUPPORTED_CSS_KEYWORDS
  # - URLs of the format url(http://url/)
  # - rgba(), hsla(), hex, or named colors
  # - numeric values
  # - transform, shape, filter, drop shadow, or variable functions
  # Comma-separated lists of these values are also allowed.
  #
  # @return [String] the value (with var() calls downcased), or "" if rejected
  def sanitize_css_value(value)
    value_stripped = strip_value(value)

    # If it's a comma-separated set of valid values, it's fine. However, we need
    # to downcase any var() functions to match the css_parser gem's downcasing
    # of property names.
    if value_stripped.match?(VALUE_LIST_REGEX)
      return value unless value.match?(VAR_FUNCTION_REGEX)

      return value.gsub(VAR_FUNCTION_REGEX, &:downcase)
    end

    # If the value is explicitly in our list of supported keywords, it's fine.
    # However, note that !important is always allowed (refer to the comments on
    # strip_value(value) and ArchiveConfig.SUPPORTED_CSS_KEYWORDS for more), and
    # that the url() function is allowed by the VALUE_REGEX above. Excluding
    # url() from SUPPORTED_CSS_KEYWORDS only strips it because of the check in
    # sanitize_css_declaration_value.
    return value if value_stripped.split(",").all? { |subval| ArchiveConfig.SUPPORTED_CSS_KEYWORDS.include?(subval.strip) }

    ""
  end

  # Sanitizes the value of the `content` property.
  # Patterns are anchored with \A/\z so the whole value must match; ^/$ would
  # accept a multi-line value in which only one line is valid.
  #
  # @return [String] the value if allowed, "" otherwise
  def sanitize_css_content(value)
    # For now we only allow a single completely quoted string
    return value if value.match?(/\A'[^']*'\z/)
    return value if value.match?(/\A"[^"]*"\z/)

    # or a valid img url
    return value if value.match?(/\A#{URL_FUNCTION_REGEX}\z/)

    # or "none"
    return value if value == "none"

    ""
  end

  # Font family names may be alphanumeric values with dashes
  #
  # @return [String] the original value when every comma-separated name is
  #   acceptable, "" otherwise
  def sanitize_css_font(value)
    names = strip_value(value).split(",")
    # \A/\z: each name must match in full, not just one of its lines
    valid = names.all? { |fontname| fontname.strip.match?(/\A(\'?[a-z0-9\- ]+\'?|\"?[a-z0-9\- ]+\"?)\z/) }
    valid ? value : ""
  end

  # Remove !important and trailing spaces from values to simplify sanitization.
  # In most cases, we return the original value after sanitization, which
  # restores the !important keyword.
  # Note that this means !important is always allowed, regardless of whether it
  # is included in ArchiveConfig.SUPPORTED_CSS_KEYWORDS.
  def strip_value(value)
    value.downcase.gsub(/(!important)/, "").strip
  end
end