otwarchive-symphonyarchive/spec/models/story_parser_spec.rb
2026-03-11 22:22:11 +00:00

350 lines
14 KiB
Ruby
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

require 'spec_helper'
require 'webmock'
describe StoryParser do
# The examples below call these parser helpers directly, so they are made
# public while this group runs and marked protected once it has finished.
exposed_helpers = %i[get_source_if_known check_for_previous_import parse_common parse_author]

before(:all) { StoryParser.send(:public, *exposed_helpers) }
after(:all) { StoryParser.send(:protected, *exposed_helpers) }

# Every example gets its own parser instance.
before { @sp = StoryParser.new }
describe "get_source_if_known:" do
  describe "the SOURCE_FFNET pattern" do
    # Bare and www-prefixed fanfiction.net URLs must be recognised as "ffnet".
    %w[
      http://fanfiction.net
      fanfiction.net
      http://www.fanfiction.net
      www.fanfiction.net
    ].each do |url|
      it "should match #{url}" do
        expect(@sp.get_source_if_known(StoryParser::CHAPTERED_STORY_LOCATIONS, url)).to eq("ffnet")
      end
    end

    # A host that merely ends in "fanfiction.net" must not be picked up.
    %w[
      http://adultfanfiction.net
      adultfanfiction.net
      http://www.adultfanfiction.net
      www.adultfanfiction.net
    ].each do |url|
      it "should not match #{url}" do
        expect(@sp.get_source_if_known(StoryParser::CHAPTERED_STORY_LOCATIONS, url)).to be_nil
      end
    end
  end

  describe "the SOURCE_LJ pattern" do
    # SOURCE_LJ = '((live|dead|insane)?journal(fen)?\.com)|dreamwidth\.org'
    # NOTE(review): the pattern quoted above only covers ".com" journal hosts, yet the
    # journalfen case below uses a ".net" URL - confirm the quoted pattern is current.
    {
      "a regular domain on livejournal" => "http://mydomain.livejournal.com",
      "a domain with underscores within on livejournal" => "http://my_domain.livejournal.com",
      "a folder style link to an individual user on livejournal" => "http://www.livejournal.com/users/_underscore",
      "a folder style link to a community on livejournal" => "http://www.livejournal.com/community/underscore_",
      "a domain on dreamwidth" => "http://mydomain.dreamwidth.org",
      "a domain on deadjournal" => "http://mydomain.deadjournal.com",
      "a domain on insanejournal" => "http://mydomain.insanejournal.com",
      "a folder style link to an individual user on journalfen" => "http://www.journalfen.net/users/username"
    }.each do |target, url|
      it "should match #{target}" do
        expect(@sp.get_source_if_known(StoryParser::KNOWN_STORY_LOCATIONS, url)).to eq("lj")
      end
    end
  end
  # TODO: KNOWN_STORY_PARSERS
end
describe "check_for_previous_import" do
  # The same page addressed with and without the "www." prefix.
  let(:www_url) { "http://www.testme.org/welcome_to_test_vale.html" }
  let(:bare_url) { "http://testme.org/welcome_to_test_vale.html" }
  # A stored location whose path merely starts with the one being checked.
  let(:longer_path_url) { "http://testme.org/welcome_to_test_vale/12345" }

  it "should recognise previously imported www. works" do
    create(:work, imported_from_url: www_url)

    expect { @sp.check_for_previous_import(bare_url) }.to raise_error(StoryParser::Error)
  end

  it "should recognise previously imported non-www. works" do
    create(:work, imported_from_url: bare_url)

    expect { @sp.check_for_previous_import(www_url) }.to raise_error(StoryParser::Error)
  end

  it "should not perform a partial match on work import locations" do
    create(:work, imported_from_url: longer_path_url)

    expect { @sp.check_for_previous_import("http://testme.org/welcome_to_test_vale/123") }.not_to raise_error
  end
end
context "#download_and_parse_chapters_into_story" do
  # Clear the url1/url2 stubs once the example finishes, as the "Import" group
  # does for its own stubs, so they cannot answer requests made by later examples.
  after { WebMock.reset! }

  it "should set the work revision date to the date of the last chapter" do
    # Let the test get at external sites, but stub out anything containing "url1" and "url2"
    WebMock.allow_net_connect!
    WebMock.stub_request(:any, /url1/)
           .to_return(status: 200, body: "Date: 2001-01-10 13:45\nstubbed response", headers: {})
    WebMock.stub_request(:any, /url2/)
           .to_return(status: 200, body: "Date: 2001-01-22 12:56\nstubbed response", headers: {})

    importer = FactoryBot.create(:user)
    work = @sp.download_and_parse_chapters_into_story(
      %w[http://url1 http://url2],
      { pseuds: [importer.default_pseud], do_not_set_current_author: false }
    )
    work.save

    # The last chapter (url2) is the one stubbed with the 2001-01-22 date.
    expect(work.revised_at.to_date).to eq(Date.new(2001, 1, 22))
  end
end
describe "#download_text" do
  before do
    # Final destination that both redirect examples are expected to end up fetching.
    WebMock.stub_request(:get, "http://example.org/foo")
           .to_return(status: 200, body: "the response of the redirect target", headers: {})
  end

  # Drop the stubs registered by this group so they do not leak into later examples.
  after { WebMock.reset! }

  it "follows relative redirects" do
    start_url = "http://example.org/bar"
    # The Location header carries only a path, to be resolved against the request URL.
    WebMock.stub_request(:get, start_url)
           .to_return(status: 302, headers: { "Location" => "/foo" })

    # download_text is not among the helpers made public for this spec, hence send.
    expect(@sp.send(:download_text, start_url)).to eq("the response of the redirect target")
  end

  it "follows absolute redirects" do
    start_url = "http://foo.com/"
    # The Location header points at a full URL on a different host.
    WebMock.stub_request(:get, start_url)
           .to_return(status: 302, headers: { "Location" => "http://example.org/foo" })

    expect(@sp.send(:download_text, start_url)).to eq("the response of the redirect target")
  end
end
describe "#parse_common" do
  it "converts relative to absolute links" do
    # Each row is: page the story was fetched from, href found in it, href expected in the output.
    #
    # This one doesn't work because the sanitizer is converting the & to &amp;
    # ['http://foo.com/bar.html', 'search.php?here=is&a=query', 'http://foo.com/search.php?here=is&a=query'],
    link_cases = [
      ['http://foo.com/bar.html', 'thisdir.html', 'http://foo.com/thisdir.html'],
      ['http://foo.com/bar.html?hello=foo', 'thisdir.html', 'http://foo.com/thisdir.html'],
      ['http://foo.com/bar.html', './thisdir.html', 'http://foo.com/thisdir.html'],
      ['http://foo.com/bar.html', 'img.jpg', 'http://foo.com/img.jpg'],
      ['http://foo.com/bat/bar.html', '../updir.html', 'http://foo.com/updir.html'],
      ['http://foo.com/bar.html', 'http://bar.com/foo.html', 'http://bar.com/foo.html'],
      ['http://foo.com/bar.html', 'search.php?hereis=aquery', 'http://foo.com/search.php?hereis=aquery']
    ]

    link_cases.each do |page_url, href, absolute_href|
      raw_story = %(<html><body><p>here is <a href="#{href}">a link</a>.</p></body></html>)
      expected_markup = %(<p>here is <a href="#{absolute_href}" rel="nofollow">a link</a>.</p>)

      parsed = @sp.parse_common(raw_story, page_url)

      expect(parsed[:chapter_attributes][:content]).to include(expected_markup)
    end
  end

  it "does NOT convert raw anchor links to absolute links" do
    page_url = "http://external_site"
    # Unquoted, fragment-only href: it must stay local to the page.
    raw_story = "<html><body><p><a href=#local>local href</p></body></html>"

    content = @sp.parse_common(raw_story, page_url)[:chapter_attributes][:content]

    expect(content).not_to include(page_url)
    expect(content).to include("#local")
  end
end
describe "#parse_author" do
  # Defines one example asserting that parse_author, given +name+ and +email+,
  # raises StoryParser::Error carrying exactly +message+.
  def self.it_rejects_author(description, name, email, message)
    it description do
      expect { @sp.parse_author("", name, email) }.to raise_error(StoryParser::Error, message)
    end
  end

  it "returns an external author name when a name and email are provided" do
    author = @sp.parse_author("", "Author Name", "author@example.com")

    expect(author.name).to eq("Author Name")
    expect(author.external_author.email).to eq("author@example.com")
  end

  it_rejects_author "raises an exception when the external author name is not provided",
                    nil, "author@example.com",
                    "No author name specified"

  it_rejects_author "raises an exception when the external author email is not provided",
                    "Author Name", nil,
                    "No author email specified"

  it_rejects_author "raises an exception when neither the external author name nor email is provided",
                    nil, nil,
                    "No author name specified\nNo author email specified"

  it "gives the same external author object for the same email" do
    first = @sp.parse_author("", "Author Name", "author@example.com")
    second = @sp.parse_author("", "Author Name Second", "author@example.com")
    third = @sp.parse_author("", "Author!! Name!!", "author@example.com")

    # One external author per email address, however many names are attached to it.
    expect(second.external_author.id).to eq(first.external_author.id)
    expect(third.external_author.id).to eq(first.external_author.id)
    expect(first.name).to eq("Author Name")
    expect(second.name).to eq("Author Name Second")
  end

  it "ignores the external author name when it is invalid" do
    author = @sp.parse_author("", "!!!!", "author@example.com")

    # With no usable name, the email address is what comes back as the name.
    expect(author.name).to eq("author@example.com")
    expect(author.external_author.email).to eq("author@example.com")
  end

  it "ignores invalid letters in the external author name" do
    author = @sp.parse_author("", "Author!! Name!!", "author@example.com")

    expect(author.name).to eq("Author Name")
    expect(author.external_author.email).to eq("author@example.com")
  end

  it_rejects_author "raises an exception when the external author email is invalid",
                    "Author Name", "not_email",
                    "Email should look like an email address."

  it_rejects_author "raises an exception when the external author name and email are invalid",
                    "!!!!", "not_email",
                    "Email should look like an email address."

  it_rejects_author "raises an exception when the external author name is blank and email is invalid",
                    "", "not_email",
                    "No author name specified\nEmail should look like an email address."

  it_rejects_author "raises an exception when the external author name is invalid and email is blank",
                    "!!!!", "",
                    "No author email specified"
end
# Registers the WebMock stubs used by the "Import" examples: real network
# access stays enabled, but any request whose URL contains one of the keywords
# below is answered with a canned story page.
def mock_external
  curly_quotes = "String with non-ASCII “Curly quotes” and apostrophes"
  utf8_page = <<~STUB
    Title: #{curly_quotes}
    Summary: #{curly_quotes}
    Fandom: #{curly_quotes}
    Rating: #{curly_quotes}
    Warnings: #{curly_quotes}
    Characters: #{curly_quotes}
    Pairing: Includes a character that broke the importer
    Category: #{curly_quotes}
    Tags: #{curly_quotes}
    Author's notes: #{curly_quotes}
    stubbed response
  STUB

  WebMock.allow_net_connect!

  # The same page served three ways: bytes tagged as binary, the string as
  # written, and the string transcoded to Windows-1252.
  pages_by_keyword = {
    /ascii-8bit/ => utf8_page.b,
    /utf-8/ => utf8_page,
    /win-1252/ => utf8_page.encode("Windows-1252")
  }
  pages_by_keyword.each do |keyword, page|
    WebMock.stub_request(:any, keyword).to_return(status: 200, body: page, headers: {})
  end

  # A body that begins with a NUL character.
  WebMock.stub_request(:any, /non-sgml-character-number-3/)
         .to_return(status: 200, body: "<body>\0When I get out of here</body>")
end
describe "Import" do
  before do
    mock_external
    @user = create(:user)
  end

  # Remove the stubs registered by mock_external once each example is done.
  after { WebMock.reset! }

  it "does not throw an exception with non-ASCII characters in metadata fields" do
    # One URL per stubbed encoding of the same page.
    %w[http://ascii-8bit http://utf-8 http://win-1252].each do |url|
      expect { @sp.download_and_parse_story(url, pseuds: [@user.default_pseud], do_not_set_current_author: false) }
        .not_to raise_error
    end
  end

  it "ignores string terminators (AO3-2251)" do
    imported = @sp.download_and_parse_story("http://non-sgml-character-number-3", pseuds: [@user.default_pseud])

    # The stubbed body starts with a NUL; the text after it must still be imported.
    expect(imported.chapters[0].content).to include("When I get out of here")
  end
end
end