981 lines
37 KiB
Ruby
981 lines
37 KiB
Ruby
# Parse stories from other websites and uploaded files, looking for metadata to harvest
|
|
# and put into the archive.
|
|
#
|
|
class StoryParser
|
|
require 'timeout'
|
|
require 'nokogiri'
|
|
require 'mechanize'
|
|
require 'open-uri'
|
|
include HtmlCleaner
|
|
|
|
OPTIONAL_META = {notes: 'Note',
|
|
freeform_string: 'Tag',
|
|
fandom_string: 'Fandom',
|
|
rating_string: 'Rating',
|
|
archive_warning_string: 'Warning',
|
|
relationship_string: 'Relationship|Pairing',
|
|
character_string: 'Character' }.freeze
|
|
REQUIRED_META = { title: 'Title',
|
|
summary: 'Summary',
|
|
revised_at: 'Date|Posted|Posted on|Posted at',
|
|
chapter_title: 'Chapter Title' }.freeze
|
|
|
|
# Use this for raising custom error messages
|
|
# (so that we can distinguish them from unexpected exceptions due to
|
|
# faulty code)
|
|
class Error < StandardError
|
|
end
|
|
|
|
# These attributes need to be moved from the work to the chapter
|
|
# format: {work_attribute_name: :chapter_attribute_name} (can be the same)
|
|
CHAPTER_ATTRIBUTES_ONLY = {}
|
|
|
|
# These attributes need to be copied from the work to the chapter
|
|
CHAPTER_ATTRIBUTES_ALSO = { revised_at: :published_at }.freeze
|
|
|
|
### NOTE ON KNOWN SOURCES
|
|
# These lists will stop with the first one it matches, so put more-specific matches
|
|
# towards the front of the list.
|
|
|
|
# places for which we have a custom parse_story_from_[source] method
|
|
# for getting information out of the downloaded text
|
|
KNOWN_STORY_PARSERS = %w[deviantart dw lj].freeze
|
|
|
|
# places for which we have a custom parse_author_from_[source] method
|
|
# which returns an external_author object including an email address
|
|
KNOWN_AUTHOR_PARSERS = %w[lj].freeze
|
|
|
|
# places for which we have a download_story_from_[source]
|
|
# used to customize the downloading process
|
|
KNOWN_STORY_LOCATIONS = %w[lj].freeze
|
|
|
|
# places for which we have a download_chaptered_from
|
|
# to get a set of chapters all together
|
|
CHAPTERED_STORY_LOCATIONS = %w[ffnet thearchive_net efiction quotev].freeze
|
|
|
|
# regular expressions to match against the URLS
|
|
SOURCE_LJ = '((live|dead|insane)journal\.com)|journalfen(\.net|\.com)|dreamwidth\.org'.freeze
|
|
SOURCE_DW = 'dreamwidth\.org'.freeze
|
|
SOURCE_FFNET = '(^|[^A-Za-z0-9-])fanfiction\.net'.freeze
|
|
SOURCE_DEVIANTART = 'deviantart\.com'.freeze
|
|
SOURCE_THEARCHIVE_NET = 'the\-archive\.net'.freeze
|
|
SOURCE_EFICTION = 'viewstory\.php'.freeze
|
|
SOURCE_QUOTEV = 'quotev\.com'.freeze
|
|
|
|
# time out if we can't download fast enough
|
|
STORY_DOWNLOAD_TIMEOUT = 60
|
|
MAX_CHAPTER_COUNT = 200
|
|
|
|
# To check for duplicate chapters, take a slice this long out of the story
|
|
# (in characters)
|
|
DUPLICATE_CHAPTER_LENGTH = 10_000
|
|
|
|
|
|
# Import many stories
|
|
# Import many stories from a list of URLs.
# Returns [works, failed_urls, errors]; failed_urls and errors are
# index-aligned so each failed URL has a matching error message.
def import_many(urls, options = {})
  # Try to get the works
  works = []
  failed_urls = []
  errors = []
  @options = options
  urls.each do |url|
    begin
      work = nil
      response = download_and_parse_work(url, options)
      work = response[:work]
      if response[:status] == :created
        if work && work.save
          work.chapters.each(&:save)
          works << work
        else
          failed_urls << url
          # FIX: work can be nil here (parser returned no work); the
          # original unconditionally called work.errors and crashed.
          errors << (work ? work.errors.values.join(", ") : "Could not create a work from #{url}.")
          work.delete if work
        end
      elsif response[:status] == :already_imported
        raise StoryParser::Error, response[:message]
      end
    rescue Timeout::Error
      failed_urls << url
      errors << "Import has timed out. This may be due to connectivity problems with the source site. Please try again in a few minutes, or check Known Issues to see if there are import problems with this site."
      work.delete if work
    rescue Error => exception
      failed_urls << url
      errors << "We couldn't successfully import that work, sorry: #{exception.message}"
      work.delete if work
    end
  end
  [works, failed_urls, errors]
end
|
|
|
|
# Downloads a story and passes it on to the parser.
|
|
# If the URL of the story is from a site for which we have special rules
|
|
# (eg, downloading from a livejournal clone, you want to use ?format=light
|
|
# to get a nice and consistent post format), it will pre-process the url
|
|
# according to the rules for that site.
|
|
# Downloads a story and passes it on to the parser.
# If the URL of the story is from a site with special rules (eg, a
# livejournal clone where ?format=light gives a consistent post format),
# the URL is pre-processed according to the rules for that site.
# Returns { status:, message:, work: }.
def download_and_parse_work(location, options = {})
  existing_work = Work.find_by_url(location)
  if existing_work
    return {
      status: :already_imported,
      message: "A work has already been imported from #{location}.",
      work: existing_work
    }
  end

  @options = options
  chaptered_source = get_source_if_known(CHAPTERED_STORY_LOCATIONS, location)
  work =
    if chaptered_source.nil?
      parse_story(download_text(location), location, options)
    else
      download_and_parse_chaptered_story(chaptered_source, location, options)
    end

  { status: :created, message: "", work: work }
end
|
|
|
|
# Given an array of urls for chapters of a single story,
|
|
# download them all and combine into a single work
|
|
# Given an array of urls for chapters of a single story,
# download them all and combine into a single work.
# Returns { status:, message:, work: }.
def import_chapters_into_story(locations, options = {})
  first_location = locations.first
  existing_work = Work.find_by_url(first_location)
  if existing_work
    return {
      status: :already_imported,
      message: "A work has already been imported from #{first_location}.",
      work: existing_work
    }
  end

  @options = options
  chapter_contents = locations.map { |location| download_text(location) }
  work = parse_chapters_into_story(first_location, chapter_contents, options)

  {
    status: :created,
    message: "Successfully created work \"" + work.title + "\".",
    work: work
  }
end
|
|
|
|
|
|
### OLD PARSING METHODS
|
|
|
|
# Import many stories
|
|
# Import many stories (legacy entry point; see import_many above).
# Returns [works, failed_urls, errors] with failed_urls/errors aligned.
def import_from_urls(urls, options = {})
  # Try to get the works
  works = []
  failed_urls = []
  errors = []
  @options = options
  urls.each do |url|
    begin
      work = nil
      work = download_and_parse_story(url, options)
      if work && work.save
        work.chapters.each(&:save)
        works << work
      else
        failed_urls << url
        # FIX: work can be nil here; the original unconditionally called
        # work.errors and crashed with NoMethodError.
        errors << (work ? work.errors.values.join(", ") : "Could not create a work from #{url}.")
        work.delete if work
      end
    rescue Timeout::Error
      failed_urls << url
      errors << "Import has timed out. This may be due to connectivity problems with the source site. Please try again in a few minutes, or check Known Issues to see if there are import problems with this site."
      work.delete if work
    rescue Error => exception
      failed_urls << url
      errors << "We couldn't successfully import that work, sorry: #{exception.message}"
      work.delete if work
    end
  end
  [works, failed_urls, errors]
end
|
|
|
|
# Downloads a story and passes it on to the parser.
|
|
# If the URL of the story is from a site for which we have special rules
|
|
# (eg, downloading from a livejournal clone, you want to use ?format=light
|
|
# to get a nice and consistent post format), it will pre-process the url
|
|
# according to the rules for that site.
|
|
# Downloads a story and passes it on to the parser (legacy entry point:
# raises Error if the URL was already imported, instead of returning a
# status hash). Chaptered sources are dispatched to their bulk
# downloader; everything else is fetched and parsed in one go.
def download_and_parse_story(location, options = {})
  check_for_previous_import(location)
  @options = options
  chaptered_source = get_source_if_known(CHAPTERED_STORY_LOCATIONS, location)
  return download_and_parse_chaptered_story(chaptered_source, location, options) if chaptered_source

  story = download_text(location)
  parse_story(story, location, options)
end
|
|
|
|
# Given an array of urls for chapters of a single story,
|
|
# download them all and combine into a single work
|
|
# Given an array of urls for chapters of a single story,
# download them all and combine into a single work (legacy entry point:
# raises Error if the first URL was already imported).
def download_and_parse_chapters_into_story(locations, options = {})
  check_for_previous_import(locations.first)
  @options = options
  chapter_contents = locations.map { |location| download_text(location) }
  parse_chapters_into_story(locations.first, chapter_contents, options)
end
|
|
|
|
### PARSING METHODS
|
|
|
|
# Parses the text of a story, optionally from a given location.
|
|
# Parses the text of a story, optionally from a given location.
# Builds a new (unsaved) Work from the parsed params and applies all
# work-level attributes (authors, tags, defaults, etc.).
def parse_story(story, location, options = {})
  work_params = parse_common(story, location, options[:encoding], options[:detect_tags])
  new_work = Work.new(work_params)
  # move any attributes from work to chapter if necessary
  set_work_attributes(new_work, location, options)
end
|
|
|
|
# parses and adds a new chapter to the end of the work
|
|
# Parses and appends a new chapter to the end of the given work.
# Returns the work with the new chapter attached (unsaved).
def parse_chapter_of_work(work, chapter_content, location, options = {})
  parsed_params = parse_common(chapter_content, location, options[:encoding], options[:detect_tags])
  new_chapter = get_chapter_from_work_params(parsed_params)
  work.chapters << set_chapter_attributes(work, new_chapter)
  work
end
|
|
|
|
# Builds a single multi-chapter Work out of a list of chapter texts.
# The first chapter's params create the work itself; each subsequent
# chapter is parsed and appended. Returns the work with all attributes set.
def parse_chapters_into_story(location, chapter_contents, options = {})
  work = nil
  chapter_contents.each do |content|
    work_params = parse_common(content, location, options[:encoding], options[:detect_tags])
    if work
      extra_chapter = get_chapter_from_work_params(work_params)
      work.chapters << set_chapter_attributes(work, extra_chapter)
    else
      # first chapter: create the new work
      work = Work.new(work_params)
    end
  end
  set_work_attributes(work, location, options)
end
|
|
|
|
# Everything below here is protected and should not be touched by outside
|
|
# code -- please use the above functions to parse external works.
|
|
|
|
protected
|
|
|
|
# tries to create an external author for a given url
|
|
# Tries to create an external author for a given url.
# If an explicit name/email was supplied, use it directly; otherwise try
# a site-specific author parser for the location. Raises Error when
# neither yields author information.
def parse_author(location, ext_author_name, ext_author_email)
  explicit_author_given = ext_author_name.present? || ext_author_email.present?
  if location.blank? || explicit_author_given
    return parse_author_common(ext_author_email, ext_author_name)
  end

  source = get_source_if_known(KNOWN_AUTHOR_PARSERS, location)
  raise Error, "No external author name or email specified" if source.nil?

  send("parse_author_from_#{source.downcase}", location)
end
|
|
|
|
# download an entire story from an archive type where we know how to parse multi-chaptered works
|
|
# this should only be called from download_and_parse_story
|
|
# Download an entire story from an archive type where we know how to
# parse multi-chaptered works, then combine the chapters into one work.
# This should only be called from download_and_parse_story.
def download_and_parse_chaptered_story(source, location, options = {})
  downloader = "download_chaptered_from_#{source.downcase}"
  chapter_contents = send(downloader, location)
  parse_chapters_into_story(location, chapter_contents, options)
end
|
|
|
|
# our custom url finder checks for previously imported URL in almost any format it may have been presented
|
|
# Raises Error if a work was already imported from this URL
# (Work.find_by_url matches previously imported URLs in almost any
# format they may have been presented in).
def check_for_previous_import(location)
  return if Work.find_by_url(location).blank?

  raise Error, "A work has already been imported from #{location}."
end
|
|
|
|
# Prepare a freshly parsed chapter for attachment to the work: it is
# positioned after the work's existing chapters and marked as posted.
# Returns the chapter.
def set_chapter_attributes(work, chapter)
  next_position = work.chapters.length + 1
  chapter.position = next_position
  chapter.posted = true
  chapter
end
|
|
|
|
# Applies all work-level attributes to a freshly parsed work:
# provenance, authorship, external authors, access restrictions,
# default tags/title, collections, language, and per-chapter checks.
# Raises Error when the work is nil, has no author, an external author
# forbids importing, or a chapter is blank. Returns the (unsaved) work.
def set_work_attributes(work, location = "", options = {})
  raise Error, "Work could not be downloaded" if work.nil?

  @options = options
  work.imported_from_url = location
  work.ip_address = options[:ip_address]
  work.expected_number_of_chapters = work.chapters.length
  # revision date comes from the last chapter; backdate if it's in the past
  work.revised_at = work.chapters.last.published_at
  if work.revised_at && work.revised_at.to_date < Date.current
    work.backdate = true
  end

  # set authors for the works
  pseuds = []
  pseuds << User.current_user.default_pseud unless options[:do_not_set_current_author] || User.current_user.nil?
  pseuds << options[:archivist].default_pseud if options[:archivist]
  pseuds << options[:pseuds] if options[:pseuds]
  pseuds = pseuds.flatten.compact.uniq
  raise Error, "A work must have at least one author specified" if pseuds.empty?
  # every pseud becomes a creator of the work AND of each chapter
  pseuds.each do |pseud|
    work.creatorships.build(pseud: pseud, enable_notifications: true)
    work.chapters.each do |chapter|
      chapter.creatorships.build(pseud: pseud)
    end
  end

  # handle importing works for others
  # build an external creatorship for each author
  if options[:importing_for_others]
    external_author_names = options[:external_author_names] || parse_author(location, options[:external_author_name], options[:external_author_email])
    # convert to an array if not already one
    external_author_names = [external_author_names] if external_author_names.is_a?(ExternalAuthorName)
    if options[:external_coauthor_name].present?
      external_author_names << parse_author(location, options[:external_coauthor_name], options[:external_coauthor_email])
    end
    external_author_names.each do |external_author_name|
      next if !external_author_name || external_author_name.external_author.blank?
      if external_author_name.external_author.do_not_import
        # we're not allowed to import works from this address
        raise Error, "Author #{external_author_name.name} at #{external_author_name.external_author.email} does not allow importing their work to this archive."
      end
      work.external_creatorships.build(external_author_name: external_author_name, archivist: (options[:archivist] || User.current_user))
    end
  end

  # lock to registered users if specified or importing for others
  work.restricted = options[:restricted] || options[:importing_for_others] || false

  # set comment permissions
  work.comment_permissions = options[:comment_permissions] || "enable_all"
  work.moderated_commenting_enabled = options[:moderated_commenting_enabled] || false

  # set default values for required tags
  # (meta_or_default keeps any value the parser already harvested;
  # presumably options then archive defaults fill the gaps — defined elsewhere)
  work.fandom_string = meta_or_default(work.fandom_string, options[:fandom], ArchiveConfig.FANDOM_NO_TAG_NAME)
  work.rating_string = meta_or_default(work.rating_string, options[:rating], ArchiveConfig.RATING_DEFAULT_TAG_NAME)
  work.archive_warning_strings = meta_or_default(work.archive_warning_strings, options[:archive_warning], ArchiveConfig.WARNING_DEFAULT_TAG_NAME)
  work.category_string = meta_or_default(work.category_string, options[:category], [])
  work.character_string = meta_or_default(work.character_string, options[:character], [])
  work.relationship_string = meta_or_default(work.relationship_string, options[:relationship], [])
  work.freeform_string = meta_or_default(work.freeform_string, options[:freeform], [])

  # set default value for title
  work.title = meta_or_default(work.title, options[:title], "Untitled Imported Work")
  work.summary = meta_or_default(work.summary, options[:summary], '')
  work.notes = meta_or_default(work.notes, options[:notes], '')

  # set collection name if present
  work.collection_names = get_collection_names(options[:collection_names]) if options[:collection_names].present?

  # set default language (English)
  work.language_id = options[:language_id] || Language.default.id

  work.posted = true if options[:post_without_preview]
  work.chapters.each do |chapter|
    if chapter.content.length > ArchiveConfig.CONTENT_MAX
      # TODO: eventually: insert a new chapter
      chapter.content.truncate(ArchiveConfig.CONTENT_MAX, omission: "<strong>WARNING: import truncated automatically because chapter was too long! Please add a new chapter for remaining content.</strong>", separator: "</p>")
    elsif chapter.content.empty?
      raise Error, "Chapter #{chapter.position} of \"#{work.title}\" is blank."
    end

    chapter.posted = true # do not save - causes the chapters to exist even if work doesn't get created!
  end
  work
end
|
|
|
|
# Tries to discover an external author (name + email) for an LJ-family
# URL (livejournal, dreamwidth, insanejournal, journalfen) by scraping
# the user's profile page. Returns nil if the URL doesn't look like an
# LJ-family post; otherwise delegates to parse_author_common.
def parse_author_from_lj(location)
  return if location !~ %r{^(?:http:\/\/)?(?<lj_name>[^.]*).(?<site_name>livejournal\.com|dreamwidth\.org|insanejournal\.com|journalfen.net)}
  email = ""
  lj_name = Regexp.last_match[:lj_name]
  site_name = Regexp.last_match[:site_name]
  if lj_name == "community"
    # whups
    # community post: fetch the page and dig the real username out of it
    # NOTE(review): this xpath is tied to one specific LJ page layout — fragile
    post_text = download_text(location)
    doc = Nokogiri.parse(post_text)
    lj_name = doc.xpath("/html/body/div[2]/div/div/div/table/tbody/tr/td[2]/span/a[2]/b").content
  end
  profile_url = "http://#{lj_name}.#{site_name}/profile"
  lj_profile = download_text(profile_url)
  doc = Nokogiri.parse(lj_profile)
  contact = doc.css('div.contact').inner_html
  if contact.present?
    # strip the surrounding markup so only the raw contact text is left
    contact.gsub! '<p class="section_body_title">Contact:</p>', ""
    contact.gsub! /<\/?(span|i)>/, ""
    contact.delete! "\n"
    contact.gsub! "<br/>", ""
    # loose email match: anything containing "@" and a dot
    if contact =~ /(.*@.*\..*)/
      email = Regexp.last_match[1]
    end
  end
  # fall back to a synthetic site address if no email was exposed
  email = "#{lj_name}@#{site_name}" if email.blank?
  parse_author_common(email, lj_name)
end
|
|
|
|
# Fallback author parser for sites we have no rules for.
# There is no way to discover an author from an arbitrary page,
# so this always returns nil (for now, nothing).
def parse_author_from_unknown(_location)
  nil
end
|
|
|
|
# Validates an external author's name and email, creating (or finding)
# the ExternalAuthor record for the email. Raises Error listing every
# validation problem found. Returns an ExternalAuthorName record.
def parse_author_common(email, name)
  errors = []

  errors << "No author name specified" if name.blank?

  if email.present?
    external_author = ExternalAuthor.find_or_create_by(email: email)
    # surface any model validation failures alongside our own checks
    errors += external_author.errors.full_messages
  else
    errors << "No author email specified"
  end

  raise Error, errors.join("\n") if errors.present?

  # convert to ASCII and strip out invalid characters (everything except alphanumeric characters, _, @ and -)
  redacted_name = name.to_ascii.gsub(/[^\w[ \-@.]]/u, "")
  if redacted_name.present?
    external_author.names.find_or_create_by(name: redacted_name)
  else
    # nothing usable left after redaction — fall back to the default name
    external_author.default_name
  end
end
|
|
|
|
# Builds a Chapter from parsed work params.
# NOTE(review): mutates work_params in place via delete_if — the caller's
# hash is stripped of keys that aren't chapter attributes or that the
# chapter already has a value for.
def get_chapter_from_work_params(work_params)
  @chapter = Chapter.new(work_params[:chapter_attributes])
  # don't override specific chapter params (eg title) with work params
  chapter_params = work_params.delete_if do |name, _param|
    !@chapter.attribute_names.include?(name.to_s) || !@chapter.send(name.to_s).blank?
  end
  @chapter.update(chapter_params)
  @chapter
end
|
|
|
|
# Fetches the raw text at a location. Sites with a custom
# download_from_[source] method (eg LJ clones) get their special
# handling; everything else goes through the plain timed download.
def download_text(location)
  custom_source = get_source_if_known(KNOWN_STORY_LOCATIONS, location)
  return download_with_timeout(location) if custom_source.nil?

  send("download_from_#{custom_source.downcase}", location)
end
|
|
|
|
# canonicalize the url for downloading from lj or clones
|
|
# Canonicalize the url for downloading from lj or clones, fetch it in
# ?format=light, and click through the adult-content interstitial when
# one appears. Returns the page body ("" if the interstitial dance fails).
def download_from_lj(location)
  # FIX: work on a copy — the original aliased `location` and the gsub!
  # calls mutated the caller's string in place.
  url = location.dup
  url.gsub!(/\#(.*)$/, "") # strip off any anchor information
  url.gsub!(/\?(.*)$/, "") # strip off any existing params at the end
  url.gsub!('_', '-') # convert underscores in usernames to hyphens
  url += "?format=light" # go to light format
  text = download_with_timeout(url)

  if text.match(/adult_check/)
    Timeout.timeout(STORY_DOWNLOAD_TIMEOUT) do
      begin
        agent = Mechanize.new
        # dreamwidth's interstitial is the first form on the page; LJ's is the third
        form = url.include?("dreamwidth") ? agent.get(url).forms.first : agent.get(url).forms.third
        page = agent.submit(form, form.buttons.first) # submits the adult concepts form
        text = page.body.force_encoding(agent.page.encoding)
      rescue StandardError
        # best-effort: if the form navigation fails, fall back to empty text
        text = ""
      end
    end
  end
  text
end
|
|
|
|
# grab all the chapters of the story from ff.net
|
|
# grab all the chapters of the story from ff.net
# Fanfiction.net forbids scraping, so imports are rejected outright.
def download_chaptered_from_ffnet(_location)
  raise Error, "Sorry, Fanfiction.net does not allow imports from their site."
end
|
|
|
|
# Quotev.com forbids scraping, so imports are rejected outright.
def download_chaptered_from_quotev(_location)
  raise Error, "Sorry, Quotev.com does not allow imports from their site."
end
|
|
|
|
# this is an efiction archive but it doesn't handle chapters normally
|
|
# best way to handle is to get the full story printable version
|
|
# We have to make it a download-chaptered because otherwise it gets sent to the
|
|
# generic efiction version since chaptered sources are checked first
|
|
# this is an efiction archive but it doesn't handle chapters normally
# best way to handle is to get the full story printable version
# We have to make it a download-chaptered because otherwise it gets sent to the
# generic efiction version since chaptered sources are checked first
def download_chaptered_from_thearchive_net(location)
  # Rewrite the story URL to the all-in-one printable view.
  # [^p]sid= avoids matching the "psid" parameter itself.
  # (named captures instead of the original's $1/$2 globals, matching
  # the style of download_chaptered_from_efiction)
  if location.match(%r{^(?<site>.*)/.*viewstory\.php.*[^p]sid=(?<storyid>\d+)($|&)}i)
    site = Regexp.last_match[:site]
    storyid = Regexp.last_match[:storyid]
    location = "#{site}/viewstory.php?action=printable&psid=#{storyid}"
  end
  text = download_with_timeout(location)
  # clean up the broken head in many efiction printable sites
  text.sub!('</style>', '</style></head>') unless text.match('</head>')
  [text]
end
|
|
|
|
# grab all the chapters of a story from an efiction-based site
|
|
# Grab all the chapters of a story from an efiction-based site, fetching
# the printable view chapter by chapter until we hit a duplicate, an
# error page, or MAX_CHAPTER_COUNT. Returns an array of chapter bodies
# (empty if the URL doesn't look like an efiction story).
def download_chaptered_from_efiction(location)
  chapter_contents = []
  if location.match(/^(?<site>.*)\/.*viewstory\.php.*sid=(?<storyid>\d+)($|&)/i)
    site = Regexp.last_match[:site]
    storyid = Regexp.last_match[:storyid]
    chapnum = 1
    last_body = ""
    Timeout::timeout(STORY_DOWNLOAD_TIMEOUT) do
      loop do
        url = "#{site}/viewstory.php?action=printable&sid=#{storyid}&chapter=#{chapnum}"
        body = download_with_timeout(url)
        # FIX: check for nil BEFORE slicing — the original sliced first,
        # so its nil check could never be reached without a crash.
        break if body.nil? || chapnum > MAX_CHAPTER_COUNT

        # get a section to check that this isn't a duplicate of previous chapter
        body_to_check = body.slice(10, DUPLICATE_CHAPTER_LENGTH)
        # stop on a repeated chapter or any of the known error pages
        # (FIX: escaped the "." in "Access denied.")
        break if body_to_check == last_body || body.match(/<div class='chaptertitle'> by <\/div>/) || body.match(/Access denied\./) || body.match(/Chapter : /)

        # save the value to check for duplicate chapter
        last_body = body_to_check

        # clean up the broken head in many efiction printable sites
        body.sub!('</style>', '</style></head>') unless body.match('</head>')
        chapter_contents << body
        chapnum += 1
      end
    end
  end
  chapter_contents
end
|
|
|
|
|
|
# This is the heavy lifter, invoked by all the story and chapter parsers.
|
|
# It takes a single string containing the raw contents of a story, parses it with
|
|
# Nokogiri into the @doc object, and then and calls a subparser.
|
|
#
|
|
# If the story source can be identified as one of the sources we know how to parse in some custom/
|
|
# special way, parse_common calls the customized parse_story_from_[source] method.
|
|
# Otherwise, it falls back to parse_story_from_unknown.
|
|
#
|
|
# This produces a hash equivalent to the params hash that is normally created by the standard work
|
|
# upload form.
|
|
#
|
|
# parse_common then calls sanitize_params (which would also be called on the standard work upload
|
|
# form results) and returns the final sanitized hash.
|
|
#
|
|
# This is the heavy lifter, invoked by all the story and chapter parsers.
# It parses the raw story text with Nokogiri into @doc, absolutizes
# links, and dispatches to parse_story_from_[source] (or the unknown
# fallback). Returns a sanitized params hash equivalent to the standard
# work upload form, with chapter-level attributes shifted into place.
def parse_common(story, location = nil, encoding = nil, detect_tags = true)
  work_params = { title: "Untitled Imported Work", chapter_attributes: { content: "" } }

  # Encode as HTML - the dummy "foo" tag will be stripped out by the sanitizer but forces Nokogiri to
  # preserve line breaks in plain text documents
  # Rescue all errors as Nokogiri complains about things the sanitizer will fix later
  story.prepend("<foo></foo>")
  @doc =
    begin
      Nokogiri::HTML5.parse(story, encoding: encoding)
    rescue StandardError
      Nokogiri::HTML5.parse("")
    end

  # Try to convert all relative links to absolute
  # FIX: location defaults to nil, and the original called
  # location.split unconditionally when no <base> tag was present.
  base =
    if @doc.at_css("base")
      @doc.css("base")[0]["href"]
    elsif location
      location.split("?").first
    end
  if base.present?
    @doc.css("a").each do |link|
      next if link["href"].blank? || link["href"].start_with?("#")
      begin
        # keep the query string aside, join the path, then re-append it
        query = link["href"].match(/(\?.*)$/) ? $1 : ""
        link["href"] = URI.join(base, link["href"].gsub(/(\?.*)$/, "")).to_s + query
      rescue StandardError
        # ignored — leave the link untouched if it can't be absolutized
      end
    end
  end

  # Extract metadata (unless detect_tags is false)
  if location && (source = get_source_if_known(KNOWN_STORY_PARSERS, location))
    params = send("parse_story_from_#{source.downcase}", story, detect_tags)
    work_params.merge!(params)
  else
    work_params.merge!(parse_story_from_unknown(story, detect_tags))
  end

  shift_chapter_attributes(sanitize_params(work_params))
end
|
|
|
|
# our fallback: parse a story from an unknown source, so we have no special
|
|
# rules.
|
|
# Fallback parser for stories from unknown sources (no special rules).
# Harvests meta from the <head> and the full text, and takes the most
# specific content container available.
def parse_story_from_unknown(story, detect_tags = true)
  work_params = { chapter_attributes: {} }
  story_head = ""
  # NOTE(review): css("head") returns a NodeSet, which is always truthy,
  # so this guard never skips; inner_html of an empty set is just "".
  story_head = @doc.css("head").inner_html if @doc.css("head")

  # Story content - Look for progressively less specific containers or grab everything
  element = @doc.at_css('.chapter-content') || @doc.at_css('body') || @doc.at_css('html') || @doc
  storytext = element ? element.inner_html : story

  # scan the head first, then the full text — full-text matches win on conflict
  meta = {}
  meta.merge!(scan_text_for_meta(story_head, detect_tags)) unless story_head.blank?
  meta.merge!(scan_text_for_meta(story, detect_tags))
  # fall back to the <title> tag when no explicit Title: label was found
  meta[:title] ||= @doc.css('title').inner_html
  work_params[:chapter_attributes][:title] = meta.delete(:chapter_title)
  work_params[:chapter_attributes][:content] = clean_storytext(storytext)
  work_params.merge!(meta)
end
|
|
|
|
# Parses a story from livejournal or a livejournal equivalent (eg, dreamwidth, insanejournal)
|
|
# Assumes that we have downloaded the story from one of those equivalents (ie, we've downloaded
|
|
# it in format=light which is a stripped-down plaintext version.)
|
|
#
|
|
# Parses a story from livejournal or a livejournal equivalent
# (eg, dreamwidth, insanejournal). Assumes the story was downloaded in
# format=light, the stripped-down plaintext version; works off @doc,
# which parse_common has already populated (the _story arg is unused).
def parse_story_from_lj(_story, detect_tags = true)
  work_params = { chapter_attributes: {} }

  # in LJ "light" format, the story contents are in the second div
  # inside the body.
  # Try the modern markup first, then the older skin, then the whole body.
  body = @doc.css("body")
  storytext = body.css("article.b-singlepost-body").inner_html
  storytext = body.css("div.aentry-post__text").inner_html if storytext.empty?
  storytext = body.inner_html if storytext.empty?

  # cleanup the text
  # storytext.gsub!(/<br\s*\/?>/i, "\n") # replace the breaks with newlines
  storytext = clean_storytext(storytext)

  work_params[:chapter_attributes][:content] = storytext
  # page title is "username: post title" — strip the username prefix
  work_params[:title] = @doc.css("title").inner_html
  work_params[:title].gsub! /^[^:]+: /, ""
  work_params.merge!(scan_text_for_meta(storytext, detect_tags))

  # the posting date lives in different elements depending on the skin
  date = @doc.css("time.b-singlepost-author-date")
  date = @doc.css("p.aentry-head__date/time") if date.empty?
  work_params[:revised_at] = convert_revised_at(date.first.inner_text) unless date.empty?

  work_params
end
|
|
|
|
# Parses a story from dreamwidth. Works off @doc, which parse_common
# has already populated (the _story arg is unused).
def parse_story_from_dw(_story, detect_tags = true)
  work_params = { chapter_attributes: {} }

  body = @doc.css("body")
  content_divs = body.css("div.contents")

  if content_divs[0].present?
    # Get rid of the DW metadata table
    content_divs[0].css("div.currents, ul.entry-management-links, div.header.inner, span.restrictions, h3.entry-title").each(&:remove)
    storytext = content_divs[0].inner_html
  else
    storytext = body.inner_html
  end

  # cleanup the text
  storytext = clean_storytext(storytext)

  work_params[:chapter_attributes][:content] = storytext
  # page title is "username: post title" — strip the username prefix
  work_params[:title] = @doc.css("title").inner_html
  work_params[:title].gsub! /^[^:]+: /, ""
  work_params.merge!(scan_text_for_meta(storytext, detect_tags))

  # fallback date source: the first <font> block
  font_blocks = @doc.xpath('//font')
  unless font_blocks.empty?
    date = font_blocks.first.inner_text
    work_params[:revised_at] = convert_revised_at(date)
  end

  # preferred date source: span.date
  # FIX: only overwrite when span.date actually has text — the original
  # unconditionally clobbered the font-block date with an empty string.
  date = @doc.css("span.date").inner_text
  work_params[:revised_at] = convert_revised_at(date) unless date.empty?

  work_params
end
|
|
|
|
# Parses a work downloaded from deviantART — either art (an image) or
# fic (text). Works off @doc, which parse_common has already populated
# (the _story arg is unused).
def parse_story_from_deviantart(_story, detect_tags = true)
  work_params = { chapter_attributes: {} }
  storytext = ""
  notes = ""

  body = @doc.css("body")
  # page title is "<work title> on DeviantArt" — strip the suffix
  title = @doc.css("title").inner_html.gsub /\s*on deviantart$/i, ""

  # Find the image (original size) if it's art
  image_full = body.css("div.dev-view-deviation img.dev-content-full")
  unless image_full[0].nil?
    storytext = "<center><img src=\"#{image_full[0]["src"]}\"></center>"
  end

  # Find the fic text if it's fic (needs the id for disambiguation, the "deviantART loves you" bit in the footer has the same class path)
  text_table = body.css(".grf-indent > div:nth-child(1)")[0]
  unless text_table.nil?
    # Try to remove some metadata (title and author) from the work's text, if possible
    # Try to remove the title: if it exists, and if it's the same as the browser title
    if text_table.css("h1")[0].present? && title && title.match(text_table.css("h1")[0].text)
      text_table.css("h1")[0].remove
    end

    # Try to remove the author: if it exists, and if it follows a certain pattern
    if text_table.css("small")[0].present? && text_table.css("small")[0].inner_html.match(/by ~.*?<a class="u" href=/m)
      text_table.css("small")[0].remove
    end
    storytext = text_table.inner_html
  end

  # cleanup the text
  storytext.gsub!(%r{<br\s*\/?>}i, "\n") # replace the breaks with newlines
  storytext = clean_storytext(storytext)
  work_params[:chapter_attributes][:content] = storytext

  # Find the notes
  content_divs = body.css("div.text-ctrl div.text")
  notes = content_divs[0].inner_html unless content_divs[0].nil?

  # cleanup the notes
  notes.gsub!(%r{<br\s*\/?>}i, "\n") # replace the breaks with newlines
  notes = clean_storytext(notes, "notes")
  work_params[:notes] = notes

  # meta labels live in the artist's notes on deviantART
  work_params.merge!(scan_text_for_meta(notes, detect_tags))
  work_params[:title] = title

  # prefer the on-page title link; author links carry class "u" and are skipped
  body.css("div.dev-title-container h1 a").each do |node|
    if node["class"] != "u"
      work_params[:title] = node.inner_html
    end
  end

  # deviantART category links become freeform tags
  tags = []
  @doc.css("div.dev-about-cat-cc a.h").each { |node| tags << node.inner_html }
  work_params[:freeform_string] = clean_tags(tags.join(ArchiveConfig.DELIMITER_FOR_OUTPUT))

  # the deviation's posting date is in the right-hand details sidebar
  details = @doc.css("div.dev-right-bar-content span[title]")
  unless details[0].nil?
    work_params[:revised_at] = convert_revised_at(details[0].inner_text)
  end

  work_params
end
|
|
|
|
# Move and/or copy any meta attributes that need to be on the chapter rather
|
|
# than on the work itself
|
|
# Move and/or copy any meta attributes that need to be on the chapter
# rather than on the work itself. Returns the (mutated) work_params.
def shift_chapter_attributes(work_params)
  chapter_params = work_params[:chapter_attributes]

  # these attributes are MOVED: the work loses its copy entirely
  CHAPTER_ATTRIBUTES_ONLY.each_pair do |work_attrib, chapter_attrib|
    next unless work_params[work_attrib] && !chapter_params[chapter_attrib]

    chapter_params[chapter_attrib] = work_params.delete(work_attrib)
  end

  # these attributes are COPIED: the work keeps its own value
  CHAPTER_ATTRIBUTES_ALSO.each_pair do |work_attrib, chapter_attrib|
    next unless work_params[work_attrib] && !chapter_params[chapter_attrib]

    chapter_params[chapter_attrib] = work_params[work_attrib]
  end

  work_params
end
|
|
|
|
# Find any cases of the given pieces of meta in the given text
|
|
# and return a hash of meta values
|
|
# Find any cases of the given pieces of meta in the given text
# and return a hash of meta values.
def scan_text_for_meta(text, detect_tags = true)
  # break up the text with some extra newlines to make matching more likely
  # and strip out some tags
  text = text.gsub(/<br/, "\n<br")
  text.gsub!(/<p/, "\n<p")
  text.gsub!(/<\/?(label|span|div|b)(.*?)?>/, '')

  meta = {}
  metapatterns = detect_tags ? REQUIRED_META.merge(OPTIONAL_META) : REQUIRED_META
  # meta fields whose values are tag lists and need tag cleanup
  is_tag = {}.tap do |h|
    %w[fandom_string relationship_string freeform_string rating_string archive_warning_string].each do |c|
      h[c.to_sym] = true
    end
  end
  # meta fields with a convert_<name> post-processing hook
  handler = {}.tap do |h|
    %w[rating_string revised_at].each do |c|
      h[c.to_sym] = "convert_#{c.to_s.downcase}"
    end
  end

  # 1. Look for Pattern: (whatever), optionally followed by a closing p or div tag
  # 2. Set meta[:metaname] = whatever
  # eg, if it finds Fandom: Stargate SG-1 it will set meta[:fandom] = Stargate SG-1
  # 3. convert_<metaname> for cleanup if such a function is defined (eg convert_rating_string)
  metapatterns.each do |metaname, pattern|
    # FIX: "\\s" (escaped) so the regexp really contains \s (whitespace);
    # the original's "\s" is Ruby's string escape for a literal space,
    # so the built pattern only matched spaces, not tabs.
    metapattern = Regexp.new("(?:#{pattern}|#{pattern.pluralize})\\s*:\\s*(.*?)(?:</(?:p|div)>)?$", Regexp::IGNORECASE)
    next unless text.match(metapattern)

    value = Regexp.last_match[1]
    value = clean_tags(value) if is_tag[metaname]
    value = clean_close_html_tags(value)
    value.strip! # lose leading/trailing whitespace
    value = send(handler[metaname], value) if handler[metaname]

    meta[metaname] = value
  end
  post_process_meta meta
end
|
|
|
|
# Download the page at +location+ and return its body as a String.
# Follows HTTP redirects up to +limit+ levels deep (default 10); the
# whole attempt is bounded by STORY_DOWNLOAD_TIMEOUT (defined elsewhere
# in the app configuration). Raises StoryParser::Error when the URL
# points at this archive or when nothing could be downloaded.
def download_with_timeout(location, limit = 10)
  story = ""
  Timeout.timeout(STORY_DOWNLOAD_TIMEOUT) do
    begin
      # we do a little cleanup here in case the user hasn't included the 'http://'
      # or if they've used capital letters or an underscore in the hostname
      uri = UrlFormatter.new(location).standardized
      # refuse to "import" a work that is already hosted on this archive
      raise Error, I18n.t("story_parser.on_archive") if ArchiveConfig.PERMITTED_HOSTS.include?(uri.host)

      response = Net::HTTP.get_response(uri)
      case response
      when Net::HTTPSuccess
        story = response.body
      when Net::HTTPRedirection
        # follow the redirect, resolving a relative Location header
        # against the current URI; once the limit is exhausted we fall
        # through with story still blank, which raises Error below
        if limit.positive?
          new_uri = URI.parse(response["location"])
          new_uri = URI.join(uri, new_uri) if new_uri.relative?
          story = download_with_timeout(new_uri.to_s, limit - 1)
        end
      else
        Rails.logger.error("------- STORY PARSER: download_with_timeout: response is not success or redirection ------")
        nil
      end
    rescue Errno::ECONNREFUSED, SocketError, EOFError => e
      # network-level failures are logged and fall through to the blank
      # check below, which converts them into a user-facing Error
      Rails.logger.error("------- STORY PARSER: download_with_timeout: error rescue: \n#{e.inspect} ------")
      nil
    end
  end
  if story.blank?
    raise Error, "We couldn't download anything from #{location}. Please make sure that the URL is correct and complete, and try again."
  end

  # clean up any erroneously included string terminator (AO3-2251)
  story.delete("\000")
end
|
|
|
|
# Return the Last-Modified timestamp reported by the server for
# +location+ (a URL), or nil when the server doesn't send one.
# Bounded by the same STORY_DOWNLOAD_TIMEOUT as story downloads.
def get_last_modified(location)
  Timeout.timeout(STORY_DOWNLOAD_TIMEOUT) do
    # URI.open rather than Kernel#open: Kernel#open("| cmd") spawns a
    # subprocess, so it must never receive an attacker-controlled
    # location string. The block form also ensures the connection is
    # closed instead of leaking the handle.
    URI.open(location) do |response|
      response.last_modified
    end
  end
end
|
|
|
|
# Check +location+ against the URL pattern constant for each source in
# +known_sources+ and return the first source name whose SOURCE_<NAME>
# pattern matches, or nil if none do.
def get_source_if_known(known_sources, location)
  known_sources.each do |source|
    # look the pattern constant up by name instead of eval-ing it:
    # const_get performs the same constant resolution (class, ancestors,
    # then Object) without the arbitrary-code-execution hazard of eval
    pattern = Regexp.new(self.class.const_get("SOURCE_#{source.upcase}"), Regexp::IGNORECASE)
    return source if location.match(pattern)
  end
  nil
end
|
|
|
|
# Drop any run of closing HTML tags (and the whitespace in front of
# them) sitting at the start of a line, leaving the rest untouched.
def clean_close_html_tags(value)
  leading_closers = %r{^(\s*</[^>]+>)+}
  value.gsub(leading_closers, '')
end
|
|
|
|
# We clean the text as if it had been submitted as the content of a
# chapter: force a valid UTF-8 string, then run it through the archive
# sanitizer for the given field.
def clean_storytext(storytext, field = "content")
  unless storytext.encoding.name == "UTF-8"
    # drop any bytes that cannot be represented rather than raising
    storytext = storytext.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
  end
  sanitize_value(field, storytext)
end
|
|
|
|
# Sanitize a raw tag string and return the surviving tags joined with
# the archive's output delimiter. Works conservatively -- it doesn't
# split tags on spaces; overlong tags are truncated on a word boundary
# instead (see truncate_on_word_boundary).
def clean_tags(tags)
  plain = Sanitize.clean(tags.force_encoding("UTF-8")) # no html allowed in tags
  candidates = plain =~ /,/ ? plain.split(/,/) : [plain]
  shortened = candidates.map do |tag|
    # strip a few forbidden characters, then cap the length
    truncate_on_word_boundary(tag.gsub(/[*<>]/, ''), ArchiveConfig.TAG_MAX)
  end
  shortened.reject(&:blank?).join(ArchiveConfig.DELIMITER_FOR_OUTPUT)
end
|
|
|
|
# Shorten +text+ to at most +max_length+ characters, preferring to keep
# whole words; a single overlong word is hard-truncated instead.
# Returns nil for blank input.
def truncate_on_word_boundary(text, max_length)
  return if text.blank?

  first_word, *remaining = text.split
  result = first_word
  remaining.each do |word|
    # only append a word when it fits, including the joining space
    result += " " + word if result.length + word.length + 1 <= max_length
  end
  result[0..max_length - 1]
end
|
|
|
|
# convert space-separated tags to comma-separated: when the input has
# no commas but does contain whitespace, treat runs of whitespace as
# separators before handing off to clean_tags
def clean_and_split_tags(tags)
  space_separated = !tags.match(/,/) && tags.match(/\s/)
  tags = tags.split(/\s+/).join(',') if space_separated
  clean_tags(tags)
end
|
|
|
|
# Convert the common ratings into whatever ratings we're
# using on this archive; anything unrecognized gets the default rating.
# NOTE(review): the "ma" prefix in the explicit branch also matches
# "mature", so the mature branch is only reached by ratings like "R"/"M"
# -- preserved as-is.
def convert_rating_string(rating)
  case rating.downcase
  when /^(nc-?1[78]|x|ma|explicit)/
    ArchiveConfig.RATING_EXPLICIT_TAG_NAME
  when /^(r|m|mature)/
    ArchiveConfig.RATING_MATURE_TAG_NAME
  when /^(pg-?1[35]|t|teen)/
    ArchiveConfig.RATING_TEEN_TAG_NAME
  when /^(pg|g|k+|k|general audiences)/
    ArchiveConfig.RATING_GENERAL_TAG_NAME
  else
    ArchiveConfig.RATING_DEFAULT_TAG_NAME
  end
end
|
|
|
|
# Parse a detected revision date string into a Date (or a Time, for
# epoch-seconds input). Returns '' for unparseable input or for dates
# in the future.
def convert_revised_at(date_string)
  date = nil
  if date_string =~ /^(\d+)$/
    # an all-digit value is probably seconds since the epoch
    # (bug fix: this previously referenced the nonexistent constant
    # +Regex+, so any all-digit input raised an unrescued NameError)
    date = Time.at(Regexp.last_match[1].to_i)
  end
  date ||= Date.parse(date_string)
  return '' if date > Date.current

  date
rescue ArgumentError, TypeError
  ''
end
|
|
|
|
# Additional processing for meta - currently to make sure warnings
# that aren't Archive warnings become additional tags instead.
# Returns the (mutated) meta hash.
def post_process_meta(meta)
  return meta unless meta[:archive_warning_string]

  processed = process_warnings(meta[:archive_warning_string], meta[:freeform_string])
  meta[:archive_warning_string] = processed[:archive_warning_string]
  meta[:freeform_string] = processed[:freeform_string]
  meta
end
|
|
|
|
# Split a comma-separated warning string into canonical Archive warnings
# and everything else. Recognized warnings are rebuilt into a
# comma-separated string; unrecognized ones are appended to the freeform
# (tag) string. Returns { archive_warning_string:, freeform_string: }.
def process_warnings(warning_string, freeform_string)
  kept_warnings = ''
  freeforms = freeform_string
  warning_string.split(/\s?,\s?/).each do |candidate|
    if ArchiveWarning.warning? candidate
      kept_warnings << ', ' unless kept_warnings.blank?
      kept_warnings << candidate
    else
      # not a canonical warning -- demote it to an ordinary tag
      freeforms = (freeforms || '') + ", #{candidate}"
    end
  end
  {
    archive_warning_string: kept_warnings,
    freeform_string: freeforms
  }
end
|
|
|
|
# tries to find appropriate existing collections and converts them to
# comma-separated collection names only; entries that match no
# collection (by name or by title) are silently dropped
def get_collection_names(collection_string)
  names = ""
  collection_string.split(',').map(&:squish).each do |candidate|
    found = Collection.find_by(name: candidate) || Collection.find_by(title: candidate)
    next unless found

    names << ", " unless names.blank?
    names << found.name
  end
  names
end
|
|
|
|
# determine which value to use for a metadata field:
# - when tags are being overridden, or nothing was detected, prefer the
#   user-provided value, then the detected value, then the default
# - otherwise the detected value always wins
def meta_or_default(detected_field, provided_field, default = nil)
  return detected_field unless @options[:override_tags] || detected_field.blank?
  return provided_field unless provided_field.blank?

  detected_field.blank? ? default : detected_field
end
|
|
end
|