# frozen_string_literal: true

# Creates a Sanitize transformer to sanitize audio and video tags
module OtwSanitize
  class MediaSanitizer
    # Attribute allowlists
    AUDIO_ATTRIBUTES = %w[
      class controls crossorigin dir
      loop muted preload src title
    ].freeze

    VIDEO_ATTRIBUTES = %w[
      class controls crossorigin dir height loop
      muted playsinline poster preload src title width
    ].freeze

    SOURCE_ATTRIBUTES = %w[src type].freeze
    TRACK_ATTRIBUTES = %w[default kind label src srclang].freeze

    # Sanitize configuration applied to media nodes. It is merged on top of
    # Sanitize::Config::ARCHIVE in #sanitized_node.
    ALLOWLIST_CONFIG = {
      elements: %w[
        audio video source track
      ] + Sanitize::Config::ARCHIVE[:elements],
      attributes: {
        "audio" => AUDIO_ATTRIBUTES,
        "video" => VIDEO_ATTRIBUTES,
        "source" => SOURCE_ATTRIBUTES,
        "track" => TRACK_ATTRIBUTES
      },
      add_attributes: {
        "audio" => {
          "controls" => "controls",
          "crossorigin" => "anonymous",
          "preload" => "metadata"
        },
        "video" => {
          "controls" => "controls",
          "playsinline" => "playsinline",
          "crossorigin" => "anonymous",
          "preload" => "metadata"
        }
      },
      protocols: {
        "audio" => {
          "src" => %w[http https]
        },
        "video" => {
          "poster" => %w[http https],
          "src" => %w[http https]
        },
        "source" => {
          "src" => %w[http https]
        },
        "track" => {
          "src" => %w[http https]
        }
      }
    }.freeze

    # Matches a src that already starts with an http(s) protocol. Anchored and
    # case-insensitive so that uppercase schemes (e.g. "HTTP://") are
    # recognised, and an "http" elsewhere in the URL (e.g. in the path) is not
    # mistaken for a protocol.
    PROTOCOL_PREFIX = %r{\Ahttps?://}i.freeze

    # Creates a callable transformer for the sanitizer to use
    #
    # @return [Proc] a lambda taking the transformer env hash; it reads
    #   env[:is_allowlisted] and env[:node]
    def self.transformer
      lambda do |env|
        # Don't continue if this node is already safelisted.
        return if env[:is_allowlisted]

        new(env[:node]).sanitized_node
      end
    end

    attr_reader :node

    # Takes a Nokogiri node
    def initialize(node)
      @node = node
    end

    # Cleans the node in place with the media allowlist.
    # Skip if it's not media or if we don't want to allowlist it
    #
    # @return [Hash, nil] { node_allowlist: [node] } when the node was cleaned,
    #   nil when the node is not a media element or its source is banned
    def sanitized_node
      return unless media_node?
      return if banned_source?

      config = Sanitize::Config.merge(Sanitize::Config::ARCHIVE, ALLOWLIST_CONFIG)
      Sanitize.clean_node!(node, config)
      tidy_boolean_attributes(node)
      { node_allowlist: [node] }
    end

    # @return [String] the node's element name, lowercased
    def node_name
      node.name.to_s.downcase
    end

    # @return [Boolean] true for audio, video, source and track elements
    def media_node?
      %w[audio video source track].include?(node_name)
    end

    # @return [String] the node's src attribute, or "" when it has none
    def source_url
      node["src"] || ""
    end

    # Extracts the normalized host from the node's src attribute.
    #
    # @return [String, nil] the host, or nil when src is blank or no host
    #   could be parsed from it
    def source_host
      url = source_url
      return nil if url.blank?

      # Surrounding whitespace would defeat the anchored protocol check below.
      url = url.strip
      # Just in case we're missing a protocol
      url = "https://#{url}" unless url.match?(PROTOCOL_PREFIX)
      Addressable::URI.parse(url).normalize.host
    end

    # Checks the source host against ArchiveConfig.BANNED_MULTIMEDIA_SRCS.
    #
    # @return [Boolean, nil] true when any banned pattern matches the host,
    #   false when none does, nil when the node has no source host
    def banned_source?
      # Parse the URL once rather than once per banned pattern.
      host = source_host
      return unless host

      ArchiveConfig.BANNED_MULTIMEDIA_SRCS.any? do |blocked|
        host.match?(blocked)
      end
    end

    # Sanitize outputs boolean attributes as attribute="". While this works,
    # attribute="attribute" is more consistent with the way we handle the
    # boolean attributes we automatically add (e.g. controls="controls").
    def tidy_boolean_attributes(node)
      node["default"] = "default" if node["default"]
      node["loop"] = "loop" if node["loop"]
      node["muted"] = "muted" if node["muted"]
    end
  end
end