otwarchive-symphonyarchive/spec/models/story_parser_spec.rb

351 lines
14 KiB
Ruby
Raw Normal View History

2026-03-11 22:22:11 +00:00
require 'spec_helper'
require 'webmock'
describe StoryParser do
# The helpers exercised below are made public for the lifetime of this
# example group and switched back to protected once the group has finished.
exposed_helpers = %i[get_source_if_known check_for_previous_import parse_common parse_author]

before(:all) { StoryParser.send(:public, *exposed_helpers) }

after(:all) { StoryParser.send(:protected, *exposed_helpers) }

# Every example works on its own fresh parser instance.
before { @sp = StoryParser.new }
describe "get_source_if_known:" do
  describe "the SOURCE_FFNET pattern" do
    # fanfiction.net is expected to be recognised with or without a scheme
    # and with or without the "www." prefix.
    %w[
      http://fanfiction.net
      fanfiction.net
      http://www.fanfiction.net
      www.fanfiction.net
    ].each do |url|
      it "should match #{url}" do
        expect(@sp.get_source_if_known(StoryParser::CHAPTERED_STORY_LOCATIONS, url)).to eq("ffnet")
      end
    end

    # adultfanfiction.net merely ends in "fanfiction.net"; no chaptered
    # source may be reported for it (nil is expected).
    %w[
      http://adultfanfiction.net
      adultfanfiction.net
      http://www.adultfanfiction.net
      www.adultfanfiction.net
    ].each do |url|
      it "should not match #{url}" do
        expect(@sp.get_source_if_known(StoryParser::CHAPTERED_STORY_LOCATIONS, url)).to be_nil
      end
    end
  end

  describe "the SOURCE_LJ pattern" do
    # SOURCE_LJ = '((live|dead|insane)?journal(fen)?\.com)|dreamwidth\.org'
    {
      "a regular domain on livejournal" => "http://mydomain.livejournal.com",
      "a domain with underscores within on livejournal" => "http://my_domain.livejournal.com",
      "a folder style link to an individual user on livejournal" => "http://www.livejournal.com/users/_underscore",
      "a folder style link to a community on livejournal" => "http://www.livejournal.com/community/underscore_",
      "a domain on dreamwidth" => "http://mydomain.dreamwidth.org",
      "a domain on deadjournal" => "http://mydomain.deadjournal.com",
      "a domain on insanejournal" => "http://mydomain.insanejournal.com",
      # The pattern quoted above only accepts journalfen under ".com"; a
      # ".net" URL would not be matched by it and the example could not pass.
      "a folder style link to an individual user on journalfen" => "http://www.journalfen.com/users/username"
    }.each do |description, url|
      it "should match #{description}" do
        expect(@sp.get_source_if_known(StoryParser::KNOWN_STORY_LOCATIONS, url)).to eq("lj")
      end
    end
  end

  # TODO: KNOWN_STORY_PARSERS
end
describe "check_for_previous_import" do
  # The same page addressed with and without the "www." host prefix, plus a
  # longer URL that merely starts with the URL checked in the last example.
  let(:www_url) { "http://www.testme.org/welcome_to_test_vale.html" }
  let(:bare_url) { "http://testme.org/welcome_to_test_vale.html" }
  let(:longer_url) { "http://testme.org/welcome_to_test_vale/12345" }

  it "should recognise previously imported www. works" do
    @work = FactoryBot.create(:work, imported_from_url: www_url)

    expect do
      @sp.check_for_previous_import(bare_url)
    end.to raise_error(StoryParser::Error)
  end

  it "should recognise previously imported non-www. works" do
    @work = FactoryBot.create(:work, imported_from_url: bare_url)

    expect do
      @sp.check_for_previous_import(www_url)
    end.to raise_error(StoryParser::Error)
  end

  it "should not perform a partial match on work import locations" do
    @work = FactoryBot.create(:work, imported_from_url: longer_url)

    # The checked URL is a strict prefix of the stored one and must not count as a hit.
    expect do
      @sp.check_for_previous_import("http://testme.org/welcome_to_test_vale/123")
    end.not_to raise_error
  end
end
context "#download_and_parse_chapters_into_story" do
  # Discard the stubs registered by the example once it has run, so the broad
  # /url1/ and /url2/ matchers cannot answer requests made by later examples.
  # The "Import" group in this file cleans up the same way.
  after { WebMock.reset! }

  it "should set the work revision date to the date of the last chapter" do
    # Let the test get at external sites, but stub out anything containing "url1" and "url2"
    WebMock.allow_net_connect!
    # Each stubbed chapter body starts with a "Date:" line; url2 carries the later date.
    WebMock.stub_request(:any, /url1/)
           .to_return(status: 200, body: "Date: 2001-01-10 13:45\nstubbed response", headers: {})
    WebMock.stub_request(:any, /url2/)
           .to_return(status: 200, body: "Date: 2001-01-22 12:56\nstubbed response", headers: {})

    storyparser_user = FactoryBot.create(:user)
    urls = %w[http://url1 http://url2]
    work = @sp.download_and_parse_chapters_into_story(
      urls,
      { pseuds: [storyparser_user.default_pseud], do_not_set_current_author: false }
    )
    work.save

    # The revision date has to be the date of the last chapter (url2), not the first.
    expect(work.revised_at.to_date).to eq(Date.new(2001, 1, 22))
  end
end
describe "#download_text" do
  let(:target_body) { "the response of the redirect target" }

  # Both examples end up being redirected to this URL.
  before do
    WebMock.stub_request(:get, "http://example.org/foo")
           .to_return(status: 200, body: target_body, headers: {})
  end

  # Stubs +source_url+ to answer with a 302 carrying the given Location header.
  def stub_redirect(source_url, location)
    WebMock.stub_request(:get, source_url)
           .to_return(status: 302, headers: { "Location" => location })
  end

  it "follows relative redirects" do
    stub_redirect("http://example.org/bar", "/foo")

    expect(@sp.send(:download_text, "http://example.org/bar")).to eq(target_body)
  end

  it "follows absolute redirects" do
    stub_redirect("http://foo.com/", "http://example.org/foo")

    expect(@sp.send(:download_text, "http://foo.com/")).to eq(target_body)
  end
end
describe "#parse_common" do
  it "converts relative to absolute links" do
    # Each entry: [location of the page, href found in the page, href expected after parsing].
    #
    # Not covered: an href whose query string contains "&", e.g.
    #   ['http://foo.com/bar.html', 'search.php?here=is&a=query']
    # because the sanitizer rewrites the "&" (presumably to "&amp;" -- TODO confirm),
    # so the parsed content would not contain the plain expected link.
    link_cases = [
      ['http://foo.com/bar.html', 'thisdir.html', 'http://foo.com/thisdir.html'],
      ['http://foo.com/bar.html?hello=foo', 'thisdir.html', 'http://foo.com/thisdir.html'],
      ['http://foo.com/bar.html', './thisdir.html', 'http://foo.com/thisdir.html'],
      ['http://foo.com/bar.html', 'img.jpg', 'http://foo.com/img.jpg'],
      ['http://foo.com/bat/bar.html', '../updir.html', 'http://foo.com/updir.html'],
      ['http://foo.com/bar.html', 'http://bar.com/foo.html', 'http://bar.com/foo.html'],
      ['http://foo.com/bar.html', 'search.php?hereis=aquery', 'http://foo.com/search.php?hereis=aquery']
    ]

    link_cases.each do |location, href, absolute_href|
      story_in = %(<html><body><p>here is <a href="#{href}">a link</a>.</p></body></html>)
      story_out = %(<p>here is <a href="#{absolute_href}" rel="nofollow">a link</a>.</p>)

      parsed = @sp.parse_common(story_in, location)

      expect(parsed[:chapter_attributes][:content]).to include(story_out)
    end
  end

  it "does NOT convert raw anchor links to absolute links" do
    location = "http://external_site"
    story_in = "<html><body><p><a href=#local>local href</p></body></html>"

    content = @sp.parse_common(story_in, location)[:chapter_attributes][:content]

    # The fragment-only href must survive without the page location being prepended.
    expect(content).not_to include(location)
    expect(content).to include("#local")
  end
end
describe "#parse_author" do
  let(:email) { "author@example.com" }

  # Every example passes an empty string as the first argument; only the
  # external author name and email (second and third arguments) vary.
  def author_for(name, address)
    @sp.parse_author("", name, address)
  end

  it "returns an external author name when a name and email are provided" do
    results = author_for("Author Name", email)

    expect(results.name).to eq("Author Name")
    expect(results.external_author.email).to eq(email)
  end

  it "raises an exception when the external author name is not provided" do
    expect { author_for(nil, email) }
      .to raise_error(StoryParser::Error, "No author name specified")
  end

  it "raises an exception when the external author email is not provided" do
    expect { author_for("Author Name", nil) }
      .to raise_error(StoryParser::Error, "No author email specified")
  end

  it "raises an exception when neither the external author name nor email is provided" do
    expect { author_for(nil, nil) }
      .to raise_error(StoryParser::Error, "No author name specified\nNo author email specified")
  end

  it "gives the same external author object for the same email" do
    res1 = author_for("Author Name", email)
    res2 = author_for("Author Name Second", email)
    res3 = author_for("Author!! Name!!", email)

    # One external author per email address, however many names are attached to it.
    expect(res2.external_author.id).to eq(res1.external_author.id)
    expect(res3.external_author.id).to eq(res1.external_author.id)
    expect(res1.name).to eq("Author Name")
    expect(res2.name).to eq("Author Name Second")
  end

  it "ignores the external author name when it is invalid" do
    results = author_for("!!!!", email)

    # With nothing usable in the name, the email address is used as the name.
    expect(results.name).to eq(email)
    expect(results.external_author.email).to eq(email)
  end

  it "ignores invalid letters in the external author name" do
    results = author_for("Author!! Name!!", email)

    expect(results.name).to eq("Author Name")
    expect(results.external_author.email).to eq(email)
  end

  it "raises an exception when the external author email is invalid" do
    expect { author_for("Author Name", "not_email") }
      .to raise_error(StoryParser::Error, "Email should look like an email address.")
  end

  it "raises an exception when the external author name and email are invalid" do
    expect { author_for("!!!!", "not_email") }
      .to raise_error(StoryParser::Error, "Email should look like an email address.")
  end

  it "raises an exception when the external author name is blank and email is invalid" do
    expect { author_for("", "not_email") }
      .to raise_error(StoryParser::Error, "No author name specified\nEmail should look like an email address.")
  end

  it "raises an exception when the external author name is invalid and email is blank" do
    expect { author_for("!!!!", "") }
      .to raise_error(StoryParser::Error, "No author email specified")
  end
end
# Allows real network connections, then registers WebMock stubs (any HTTP
# verb) for URLs containing one of these keywords:
#   ascii-8bit, utf-8, win-1252 -- the same metadata document, served as
#     binary-tagged bytes, as UTF-8 and transcoded to Windows-1252
#   non-sgml-character-number-3 -- a small HTML body that contains a NUL character
def mock_external
  non_ascii_text = "String with non-ASCII “Curly quotes” and apostrophes"
  utf8_document = <<~STUB
    Title: #{non_ascii_text}
    Summary: #{non_ascii_text}
    Fandom: #{non_ascii_text}
    Rating: #{non_ascii_text}
    Warnings: #{non_ascii_text}
    Characters: #{non_ascii_text}
    Pairing: Includes a character that broke the importer
    Category: #{non_ascii_text}
    Tags: #{non_ascii_text}
    Author's notes: #{non_ascii_text}
    stubbed response
  STUB

  WebMock.allow_net_connect!

  # Same bytes for the first two entries; only the encoding tag differs.
  {
    /ascii-8bit/ => utf8_document.b,
    /utf-8/ => utf8_document,
    /win-1252/ => utf8_document.encode("Windows-1252")
  }.each do |url_pattern, payload|
    WebMock.stub_request(:any, url_pattern)
           .to_return(status: 200, body: payload, headers: {})
  end

  WebMock.stub_request(:any, /non-sgml-character-number-3/)
         .to_return(status: 200, body: "<body>\0When I get out of here</body>")
end
describe "Import" do
  # Register the keyword-based stubs and create the user whose default pseud
  # is handed to the importer.
  before do
    mock_external
    @user = create(:user)
  end

  # Drop the stubs again so they do not outlive the example.
  after { WebMock.reset! }

  it "does not throw an exception with non-ASCII characters in metadata fields" do
    # One URL per stubbed encoding of the same metadata document.
    %w[http://ascii-8bit http://utf-8 http://win-1252].each do |url|
      expect do
        @sp.download_and_parse_story(url, pseuds: [@user.default_pseud], do_not_set_current_author: false)
      end.not_to raise_error
    end
  end

  it "ignores string terminators (AO3-2251)" do
    # The stubbed body has a NUL character in front of the text.
    story = @sp.download_and_parse_story("http://non-sgml-character-number-3", pseuds: [@user.default_pseud])

    expect(story.chapters[0].content).to include("When I get out of here")
  end
end
end