# frozen_string_literal: true require "addressable/uri" require "cgi" module OtwSanitize # Creates a Sanitize transformer to sanitize embedded media class EmbedSanitizer ALLOWLIST_REGEXES = { "4shared": %r{^4shared\.com/web/embed}, audiocom: %r{^audio\.com/embed/audio/}, archiveorg: %r{^archive\.org/embed/}, bilibili: %r{^(player\.)?bilibili\.com/}, criticalcommons: %r{^criticalcommons\.org/}, eighttracks: %r{^8tracks\.com/}, google: %r{^google\.com/}, podfic: %r{^podfic\.com/}, soundcloud: %r{^(w\.)?soundcloud\.com/}, spotify: %r{^(open\.)?spotify\.com/}, viddersnet: %r{^vidders\.net/}, viddertube: %r{^viddertube\.com/}, vimeo: %r{^(player\.)?vimeo\.com/}, youtube: %r{^youtube(-nocookie)?\.com/} }.freeze ALLOWS_FLASHVARS = %i[ criticalcommons eighttracks google podfic soundcloud spotify viddersnet ].freeze SUPPORTS_HTTPS = %i[ 4shared audiocom archiveorg bilibili eighttracks podfic soundcloud spotify viddersnet viddertube vimeo youtube ].freeze # Creates a callable transformer for the sanitizer to use def self.transformer lambda do |env| # Don't continue if this node is already safelisted. return if env[:is_allowlisted] new(env[:node]).sanitized_node end end attr_reader :node # Takes a Nokogiri node def initialize(node) @node = node end def sanitized_node return unless embed_node? return unless source_url && source ensure_https if parent_name == "object" sanitize_object else sanitize_embed end end def node_name node.name.to_s.downcase end delegate :parent, to: :node def parent_name parent.name.to_s.downcase if parent end # Since the transformer receives the deepest nodes first, we look for a # element whose parent is an , or an embed or iframe def embed_node? (node_name == "param" && parent_name == "object") || %w[embed iframe].include?(node_name) end # Compare the url to our list of allowlisted sources # and return the appropriate source symbol def source return @source if @source ALLOWLIST_REGEXES.each_pair do |name, reg| if source_url =~ reg @source = name break end end @source end # Get the url of the thing we're embedding and standardize it def source_url return @source_url if @source_url if node_name == "param" # Quick XPath search to find the node that contains the video URL. return unless (movie_node = node.parent.search('param[@name="movie"]')[0]) url = movie_node["value"] else url = node["src"] end @source_url = standardize_url(url) end def standardize_url(url) # strip off optional protocol and www protocol_regex = %r{^(?:https?:)?//(?:www\.)?}i # normalize the url url = url&.gsub(protocol_regex, "") begin Addressable::URI.parse(url).normalize.to_s rescue StandardError nil end end # For sites that support https, ensure we use a secure embed def ensure_https return unless supports_https? && node["src"].present? node["src"] = node["src"].gsub("http:", "https:") return unless allows_flashvars? && node["flashvars"].present? node["flashvars"] = node["flashvars"].gsub("http:", "https:") node["flashvars"] = node["flashvars"].gsub("http%3A", "https%3A") end # We're now certain that this is an embed from a trusted source, but we # still need to run it through a special Sanitize step to ensure # that no unwanted elements or attributes that don't belong in # a video embed can sneak in. def sanitize_object Sanitize.clean_node!( parent, elements: %w[embed object param], attributes: { "embed" => %w[allowfullscreen height src type width], "object" => %w[height width], "param" => %w[name value] } ) disable_scripts(parent) { node_allowlist: [node, parent] } end def sanitize_embed Sanitize.clean_node!( node, elements: %w[embed iframe], attributes: { "embed" => %w[ allowfullscreen height src type width ] + optional_embed_attributes, "iframe" => %w[ allowfullscreen frameborder height src title class type width ] } ) if node_name == "embed" disable_scripts(node) node["flashvars"] = "" unless allows_flashvars? end { node_allowlist: [node] } end # disable script access and networking def disable_scripts(embed_node) embed_node["allowscriptaccess"] = "never" embed_node["allownetworking"] = "internal" embed_node.search("param").each do |param_node| param_node.unlink if param_node[:name].casecmp?("allowscriptaccess") || param_node[:name].casecmp?("allownetworking") end end def optional_embed_attributes if allows_flashvars? %w[wmode flashvars] else [] end end def allows_flashvars? ALLOWS_FLASHVARS.include?(source) end def supports_https? SUPPORTS_HTTPS.include?(source) end end end