require "spec_helper" require "nokogiri" describe HtmlCleaner do include HtmlCleaner def one_cell_table(content) "

#{content}

" end describe "sanitize_value" do ArchiveConfig.FIELDS_ALLOWING_MEDIA_EMBEDS.each do |field| context "#{field} is configured to allow media embeds" do %w[youtube.com youtube-nocookie.com vimeo.com player.vimeo.com vidders.net criticalcommons.org google.com podfic.com archive.org open.spotify.com spotify.com 8tracks.com w.soundcloud.com soundcloud.com viddertube.com bilibili.com player.bilibili.com 4shared.com/web/embed audio.com/embed/audio].each do |source| it "keeps embeds from #{source}" do html = '' result = sanitize_value(field, html) expect(result).to include(html) end end %w[youtube.com youtube-nocookie.com vimeo.com player.vimeo.com archive.org 8tracks.com podfic.com open.spotify.com spotify.com w.soundcloud.com soundcloud.com vidders.net viddertube.com bilibili.com player.bilibili.com 4shared.com/web/embed audio.com/embed/audio].each do |source| it "converts src to https for #{source}" do html = '' result = sanitize_value(field, html) expect(result).to match('https:') end end %w[vidders.net].each do |source| it "converts flashvars to https for #{source}" do html = '

' result = sanitize_value(field, html) expect(result).to match('flashvars=.*https:') end end it "keeps google player embeds without closing tag" do # HTML5 disallows </embed>, according to https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/embed#technical_summary html1 = '

' html2 = "#{html1}" result = sanitize_value(field, html2) expect(result).to eq(html1) end it "strips embeds with unknown source" do html = '

' result = sanitize_value(field, html) expect(result).to be_empty end it "strips archive.org iframe if the src is not the embed directory" do html = '' result = sanitize_value(field, html) expect(result).to be_empty end %w[criticalcommons.org].each do |source| it "doesn't convert src to https for #{source}" do html = '' result = sanitize_value(field, html) expect(result).not_to match('https:') end end it "allows video tags" do html = '

' expect(sanitize_value(field, html)).to eq(html) end it "allows audio tags" do html = '

' expect(sanitize_value(field, html)).to eq(html) end end end context "Strip out tags not allowed in text fields other than content" do [:endnotes, :notes, :summary].each do |field| it "strips iframes" do value = '' result = sanitize_value(field, value) expect(result).to eq("") end it "strips video tags" do value = "

" result = sanitize_value(field, value) expect(result).to eq("") end end end ArchiveConfig.FIELDS_ALLOWING_CSS.each do |field| context "#{field} field allows class attribute for CSS" do context "class has one value" do it "keeps values containing only letters, numbers, and hyphens" do result = sanitize_value(field, '

foobar

') doc = Nokogiri::HTML5.fragment(result) expect(doc.xpath("./p[@class='f-5']/node()").to_s.strip).to eq("foobar") end it "strips values starting with a number" do result = sanitize_value(field, '

foobar

') expect(result).not_to match(/8ball/) end it "strips values starting with a hyphen" do result = sanitize_value(field, '

foobar

') expect(result).not_to match(/-dash/) end it "strips values with special characters" do result = sanitize_value(field, '

foobar

') expect(result).not_to match(/foo@bar/) end end context "class attribute has multiple values" do it "keeps all valid values" do result = sanitize_value(field, '

foobar

') doc = Nokogiri::HTML5.fragment(result) expect(doc.xpath("./p[contains(@class, 'foo bar')]/node()").to_s.strip).to eq("foobar") end it "strips values starting with numbers" do result = sanitize_value(field, '

foobar

') expect(result).not_to match(/8ball/) expect(result).to match(/magic/) end it "strips values starting with hypens" do result = sanitize_value(field, '

foobar

') expect(result).not_to match(/-dash/) expect(result).to match(/rainbow/) end end end end [:comment_content, :bookmarker_notes, :summary].each do |field| context "#{field} field does not allow class attribute" do it "strips attribute even if value is valid" do result = sanitize_value(field, '

foobar

') expect(result).not_to match(/f-5/) expect(result).not_to match(/class/) end end end [:content, :endnotes, :notes, :summary].each do |field| context "Sanitize #{field} field" do it "keeps html" do value = "hello

world

" result = sanitize_value(field, value) doc = Nokogiri::HTML5.fragment(result) expect(doc.xpath(".//em/node()").to_s.strip).to eq("hello") expect(doc.xpath(".//blockquote/node()").to_s.strip).to eq("

world

") end it "should keep valid unicode chars as is" do result = sanitize_value(field, "„‚nörmäl’—téxt‘“") expect(result).to match(/„‚nörmäl’—téxt‘“/) end it "allows RTL content in p" do html = '

This is RTL content

' result = sanitize_value(field, html) expect(result).to eq(html) end it "allows RTL content in div" do html = '

This is RTL content

' result = sanitize_value(field, html) expect(result).to eq('

This is RTL content

') end it "should not allow iframes with unknown source" do html = '' result = sanitize_value(field, html) expect(result).to be_empty end [ "'';!--\"=&{()}", '' ].each do |value| it "should strip xss tags: #{value}" do result = sanitize_value(field, value) expect(result).not_to match(/xss/i) end end [ "", '<', "", '', '' ].each do |value| it "should strip script tags: #{value}" do result = sanitize_value(field, value) expect(result).not_to match(/script/i) expect(result).not_to match(/ha.ckers.org/) end end [ "\\\";alert('XSS');//", "xss:expr/*blah*/ession(alert('XSS'))", "xss:expression(alert('XSS'))" ].each do |value| it "should keep text: #{value}" do result = sanitize_value(field, value) expect(result).to match(/alert\('XSS'\)/) end end it "should strip iframe tags" do value = "