otwarchive-symphonyarchive/lib/otw_sanitize/media_sanitizer.rb
2026-03-11 22:22:11 +00:00

126 lines
3.2 KiB
Ruby

# frozen_string_literal: true
# Creates a Sanitize transformer to sanitize audio and video tags
module OtwSanitize
  # Sanitize transformer for HTML media elements (audio, video, source, track).
  #
  # For each media node it:
  #   1. rejects the node when its +src+ host matches an entry in
  #      ArchiveConfig.BANNED_MULTIMEDIA_SRCS,
  #   2. cleans the node with the ARCHIVE config extended by ALLOWLIST_CONFIG,
  #   3. rewrites selected boolean attributes to the attribute="attribute" form,
  #   4. reports the node as allowlisted.
  #
  # Nodes that are not media, or that are rejected, yield +nil+ from the
  # transformer.
  class MediaSanitizer
    # Element names this transformer is responsible for.
    MEDIA_ELEMENTS = %w[audio video source track].freeze

    # Attribute allowlists
    AUDIO_ATTRIBUTES = %w[
      class controls crossorigin dir
      loop muted preload src title
    ].freeze

    VIDEO_ATTRIBUTES = %w[
      class controls crossorigin dir height loop
      muted playsinline poster preload src title width
    ].freeze

    SOURCE_ATTRIBUTES = %w[src type].freeze
    TRACK_ATTRIBUTES = %w[default kind label src srclang].freeze

    # Boolean attributes that tidy_boolean_attributes rewrites after cleaning.
    BOOLEAN_ATTRIBUTES = %w[default loop muted].freeze

    # Leading "http:" / "https:" (any case) followed by any run of slashes or
    # backslashes. Used to reduce a src to "host/path..." before host parsing.
    SCHEME_PREFIX = %r{\A(?:https?:)?[/\\]*}i.freeze

    ALLOWLIST_CONFIG = {
      elements: MEDIA_ELEMENTS + Sanitize::Config::ARCHIVE[:elements],
      attributes: {
        "audio" => AUDIO_ATTRIBUTES,
        "video" => VIDEO_ATTRIBUTES,
        "source" => SOURCE_ATTRIBUTES,
        "track" => TRACK_ATTRIBUTES
      },
      add_attributes: {
        "audio" => {
          "controls" => "controls",
          "crossorigin" => "anonymous",
          "preload" => "metadata"
        },
        "video" => {
          "controls" => "controls",
          "playsinline" => "playsinline",
          "crossorigin" => "anonymous",
          "preload" => "metadata"
        }
      },
      protocols: {
        "audio" => {
          "src" => %w[http https]
        },
        "video" => {
          "poster" => %w[http https],
          "src" => %w[http https]
        },
        "source" => {
          "src" => %w[http https]
        },
        "track" => {
          "src" => %w[http https]
        }
      }
    }.freeze

    # ARCHIVE config merged with the media allowlist. Built once at load time
    # instead of being re-merged for every media node.
    SANITIZE_CONFIG = Sanitize::Config.merge(Sanitize::Config::ARCHIVE, ALLOWLIST_CONFIG).freeze

    # Creates a callable transformer for the sanitizer to use.
    #
    # @return [Proc] lambda taking the transformer env hash; it returns the
    #   result of #sanitized_node, or nil when the node is already allowlisted
    def self.transformer
      lambda do |env|
        # Don't continue if this node is already allowlisted.
        next if env[:is_allowlisted]

        new(env[:node]).sanitized_node
      end
    end

    attr_reader :node

    # @param node the node handed to the transformer (a Nokogiri node)
    def initialize(node)
      @node = node
    end

    # Cleans the node in place and allowlists it.
    #
    # @return [Hash, nil] { node_allowlist: [node] } for an accepted media
    #   node; nil when the node is not media or its source is banned
    def sanitized_node
      return unless media_node?
      return if banned_source?

      Sanitize.clean_node!(node, SANITIZE_CONFIG)
      tidy_boolean_attributes(node)
      { node_allowlist: [node] }
    end

    # @return [String] the node's element name, lowercased
    def node_name
      node.name.to_s.downcase
    end

    # @return [Boolean] whether the node is one of MEDIA_ELEMENTS
    def media_node?
      MEDIA_ELEMENTS.include?(node_name)
    end

    # @return [String] the node's src attribute, or "" when it has none
    def source_url
      node["src"] || ""
    end

    # Extracts the normalized host of the node's src.
    #
    # The src is trimmed, any leading http(s) scheme and slashes are removed
    # (case-insensitively), and a canonical "https://" prefix is put back
    # before parsing. This keeps the host lookup from being fooled by an
    # upper-case scheme ("HTTP://host"), a missing scheme, a protocol-relative
    # URL ("//host"), a wrong number of slashes ("https:/host") or the text
    # "http" merely appearing somewhere in the path.
    #
    # @return [String, nil] the host, or nil when src is blank
    # @raise [Addressable::URI::InvalidURIError] if the URL cannot be parsed
    def source_host
      url = source_url
      return nil if url.blank?

      remainder = url.strip.sub(SCHEME_PREFIX, "")
      Addressable::URI.parse("https://#{remainder}").normalize.host
    end

    # Checks the node's src host against ArchiveConfig.BANNED_MULTIMEDIA_SRCS.
    # Each entry is used as a pattern matched against the host.
    #
    # @return [Boolean] true when the host matches a banned entry, or when the
    #   src cannot be parsed at all; false when there is no host to check
    def banned_source?
      host = source_host
      return false if host.blank?

      ArchiveConfig.BANNED_MULTIMEDIA_SRCS.any? { |blocked| host.match?(blocked) }
    rescue Addressable::URI::InvalidURIError
      # Fail closed: a src whose host cannot be determined cannot be shown to
      # be outside the ban list, so the node is not allowlisted.
      true
    end

    # Sanitize outputs boolean attributes as attribute="". While this works,
    # attribute="attribute" is more consistent with the way we handle the
    # boolean attributes we automatically add (e.g. controls="controls").
    #
    # An empty string is truthy in Ruby, so attribute="" still counts as
    # present here; only a missing attribute (nil) is skipped.
    def tidy_boolean_attributes(node)
      BOOLEAN_ATTRIBUTES.each do |name|
        node[name] = name if node[name]
      end
    end
  end
end