# Parse stories from other websites and uploaded files, looking for metadata to harvest
# and put into the archive.
#
class StoryParser
  require 'timeout'
  require 'nokogiri'
  require 'mechanize'
  require 'open-uri'
  include HtmlCleaner

  OPTIONAL_META = { notes: 'Note',
                    freeform_string: 'Tag',
                    fandom_string: 'Fandom',
                    rating_string: 'Rating',
                    archive_warning_string: 'Warning',
                    relationship_string: 'Relationship|Pairing',
                    character_string: 'Character' }.freeze

  REQUIRED_META = { title: 'Title',
                    summary: 'Summary',
                    revised_at: 'Date|Posted|Posted on|Posted at',
                    chapter_title: 'Chapter Title' }.freeze

  # Use this for raising custom error messages
  # (so that we can distinguish them from unexpected exceptions due to
  # faulty code)
  class Error < StandardError
  end

  # These attributes need to be moved from the work to the chapter
  # format: {work_attribute_name: :chapter_attribute_name} (can be the same)
  CHAPTER_ATTRIBUTES_ONLY = {}

  # These attributes need to be copied from the work to the chapter
  CHAPTER_ATTRIBUTES_ALSO = { revised_at: :published_at }.freeze

  ### NOTE ON KNOWN SOURCES
  # Lookups stop at the first entry in each list that matches, so put
  # more-specific matches towards the front of the list.

  # places for which we have a custom parse_story_from_[source] method
  # for getting information out of the downloaded text
  KNOWN_STORY_PARSERS = %w[deviantart dw lj].freeze

  # places for which we have a custom parse_author_from_[source] method
  # which returns an external_author object including an email address
  KNOWN_AUTHOR_PARSERS = %w[lj].freeze

  # places for which we have a download_story_from_[source] method
  # used to customize the downloading process
  KNOWN_STORY_LOCATIONS = %w[lj].freeze

  # places for which we have a download_chaptered_from_[source] method
  # to get a set of chapters all together
  CHAPTERED_STORY_LOCATIONS = %w[ffnet thearchive_net efiction quotev].freeze

  # regular expressions to match against the URLs
  SOURCE_LJ = '((live|dead|insane)journal\.com)|journalfen(\.net|\.com)|dreamwidth\.org'.freeze
  SOURCE_DW = 'dreamwidth\.org'.freeze
  SOURCE_FFNET = '(^|[^A-Za-z0-9-])fanfiction\.net'.freeze
  SOURCE_DEVIANTART = 'deviantart\.com'.freeze
  SOURCE_THEARCHIVE_NET = 'the\-archive\.net'.freeze
  SOURCE_EFICTION = 'viewstory\.php'.freeze
  SOURCE_QUOTEV = 'quotev\.com'.freeze

  # time out if we can't download fast enough
  STORY_DOWNLOAD_TIMEOUT = 60
  MAX_CHAPTER_COUNT = 200

  # To check for duplicate chapters, take a slice this long out of the story
  # (in characters)
  DUPLICATE_CHAPTER_LENGTH = 10_000

  # Import many stories
  def import_many(urls, options = {})
    # Try to get the works
    works = []
    failed_urls = []
    errors = []
    @options = options
    urls.each do |url|
      begin
        response = download_and_parse_work(url, options)
        work = response[:work]
        if response[:status] == :created
          if work && work.save
            work.chapters.each(&:save)
            works << work
          else
            failed_urls << url
            errors << work.errors.values.join(", ")
            work.delete if work
          end
        elsif response[:status] == :already_imported
          raise StoryParser::Error, response[:message]
        end
      rescue Timeout::Error
        failed_urls << url
        errors << "Import has timed out. This may be due to connectivity problems with the source site. Please try again in a few minutes, or check Known Issues to see if there are import problems with this site."
        work.delete if work
      rescue Error => exception
        failed_urls << url
        errors << "We couldn't successfully import that work, sorry: #{exception.message}"
        work.delete if work
      end
    end
    [works, failed_urls, errors]
  end
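  # A minimal usage sketch for import_many above (the URL and pseud are
  # hypothetical; the options hash is passed through to set_work_attributes):
  #
  #   works, failed_urls, errors = StoryParser.new.import_many(
  #     ["http://example.com/fic/123"],
  #     pseuds: [some_pseud], post_without_preview: true
  #   )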
  # Downloads a story and passes it on to the parser.
  # If the URL of the story is from a site for which we have special rules
  # (eg, downloading from a livejournal clone, you want to use ?format=light
  # to get a nice and consistent post format), it will pre-process the url
  # according to the rules for that site.
  def download_and_parse_work(location, options = {})
    status = :created
    message = ""
    work = Work.find_by_url(location)
    if work.nil?
      @options = options
      source = get_source_if_known(CHAPTERED_STORY_LOCATIONS, location)
      if source.nil?
        story = download_text(location)
        work = parse_story(story, location, options)
      else
        work = download_and_parse_chaptered_story(source, location, options)
      end
    else
      status = :already_imported
      message = "A work has already been imported from #{location}."
    end
    { status: status, message: message, work: work }
  end

  # Given an array of urls for chapters of a single story,
  # download them all and combine into a single work
  def import_chapters_into_story(locations, options = {})
    status = :created
    work = Work.find_by_url(locations.first)
    if work.nil?
      chapter_contents = []
      @options = options
      locations.each do |location|
        chapter_contents << download_text(location)
      end
      work = parse_chapters_into_story(locations.first, chapter_contents, options)
      message = "Successfully created work \"#{work.title}\"."
    else
      status = :already_imported
      message = "A work has already been imported from #{locations.first}."
    end
    { status: status, message: message, work: work }
  end

  ### OLD PARSING METHODS

  # Import many stories
  def import_from_urls(urls, options = {})
    # Try to get the works
    works = []
    failed_urls = []
    errors = []
    @options = options
    urls.each do |url|
      begin
        work = download_and_parse_story(url, options)
        if work && work.save
          work.chapters.each(&:save)
          works << work
        else
          failed_urls << url
          errors << work.errors.values.join(", ")
          work.delete if work
        end
      rescue Timeout::Error
        failed_urls << url
        errors << "Import has timed out. This may be due to connectivity problems with the source site. Please try again in a few minutes, or check Known Issues to see if there are import problems with this site."
        work.delete if work
      rescue Error => exception
        failed_urls << url
        errors << "We couldn't successfully import that work, sorry: #{exception.message}"
        work.delete if work
      end
    end
    [works, failed_urls, errors]
  end

  # Downloads a story and passes it on to the parser.
  # If the URL of the story is from a site for which we have special rules
  # (eg, downloading from a livejournal clone, you want to use ?format=light
  # to get a nice and consistent post format), it will pre-process the url
  # according to the rules for that site.
  def download_and_parse_story(location, options = {})
    check_for_previous_import(location)
    @options = options
    source = get_source_if_known(CHAPTERED_STORY_LOCATIONS, location)
    if source.nil?
      story = download_text(location)
      work = parse_story(story, location, options)
    else
      work = download_and_parse_chaptered_story(source, location, options)
    end
    work
  end

  # Given an array of urls for chapters of a single story,
  # download them all and combine into a single work
  def download_and_parse_chapters_into_story(locations, options = {})
    check_for_previous_import(locations.first)
    chapter_contents = []
    @options = options
    locations.each do |location|
      chapter_contents << download_text(location)
    end
    parse_chapters_into_story(locations.first, chapter_contents, options)
  end
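  # A sketch of the chaptered entry point above (URLs hypothetical): each URL
  # is downloaded in order and becomes one chapter, and the first URL is
  # recorded as the work's imported_from_url.
  #
  #   parser.download_and_parse_chapters_into_story(
  #     ["http://example.com/fic/123?chapter=1",
  #      "http://example.com/fic/123?chapter=2"],
  #     pseuds: [some_pseud]
  #   )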
  ### PARSING METHODS

  # Parses the text of a story, optionally from a given location.
  def parse_story(story, location, options = {})
    work_params = parse_common(story, location, options[:encoding], options[:detect_tags])
    # move any attributes from work to chapter if necessary
    set_work_attributes(Work.new(work_params), location, options)
  end

  # parses and adds a new chapter to the end of the work
  def parse_chapter_of_work(work, chapter_content, location, options = {})
    tmp_work_params = parse_common(chapter_content, location, options[:encoding], options[:detect_tags])
    chapter = get_chapter_from_work_params(tmp_work_params)
    work.chapters << set_chapter_attributes(work, chapter)
    work
  end

  def parse_chapters_into_story(location, chapter_contents, options = {})
    work = nil
    chapter_contents.each do |content|
      work_params = parse_common(content, location, options[:encoding], options[:detect_tags])
      if work.nil?
        # create the new work
        work = Work.new(work_params)
      else
        new_chapter = get_chapter_from_work_params(work_params)
        work.chapters << set_chapter_attributes(work, new_chapter)
      end
    end
    set_work_attributes(work, location, options)
  end

  # Everything below here is protected and should not be touched by outside
  # code -- please use the above functions to parse external works.

  protected

  # tries to create an external author for a given url
  def parse_author(location, ext_author_name, ext_author_email)
    if location.present? && ext_author_name.blank? && ext_author_email.blank?
      source = get_source_if_known(KNOWN_AUTHOR_PARSERS, location)
      if source.nil?
        raise Error, "No external author name or email specified"
      else
        send("parse_author_from_#{source.downcase}", location)
      end
    else
      parse_author_common(ext_author_email, ext_author_name)
    end
  end

  # download an entire story from an archive type where we know how to parse multi-chaptered works
  # this should only be called from download_and_parse_story
  def download_and_parse_chaptered_story(source, location, options = {})
    chapter_contents = send("download_chaptered_from_#{source.downcase}", location)
    parse_chapters_into_story(location, chapter_contents, options)
  end

  # our custom url finder checks for a previously imported URL in almost any
  # format in which it may have been presented
  def check_for_previous_import(location)
    if Work.find_by_url(location).present?
      raise Error, "A work has already been imported from #{location}."
    end
  end

  def set_chapter_attributes(work, chapter)
    chapter.position = work.chapters.length + 1
    chapter.posted = true
    chapter
  end

  def set_work_attributes(work, location = "", options = {})
    raise Error, "Work could not be downloaded" if work.nil?

    @options = options
    work.imported_from_url = location
    work.ip_address = options[:ip_address]
    work.expected_number_of_chapters = work.chapters.length
    work.revised_at = work.chapters.last.published_at
    work.backdate = true if work.revised_at && work.revised_at.to_date < Date.current

    # set authors for the works
    pseuds = []
    pseuds << User.current_user.default_pseud unless options[:do_not_set_current_author] || User.current_user.nil?
    pseuds << options[:archivist].default_pseud if options[:archivist]
    pseuds << options[:pseuds] if options[:pseuds]
    pseuds = pseuds.flatten.compact.uniq
    raise Error, "A work must have at least one author specified" if pseuds.empty?
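    # For example, a (hypothetical) archivist import of a co-authored work
    # might pass options = { archivist: archivist_user, pseuds: [coauthor_pseud] },
    # which yields pseuds == [archivist_user.default_pseud, coauthor_pseud]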
    pseuds.each do |pseud|
      work.creatorships.build(pseud: pseud, enable_notifications: true)
      work.chapters.each do |chapter|
        chapter.creatorships.build(pseud: pseud)
      end
    end

    # handle importing works for others
    # build an external creatorship for each author
    if options[:importing_for_others]
      external_author_names = options[:external_author_names] ||
                              parse_author(location, options[:external_author_name], options[:external_author_email])
      # convert to an array if not already one
      external_author_names = [external_author_names] if external_author_names.is_a?(ExternalAuthorName)
      if options[:external_coauthor_name].present?
        external_author_names << parse_author(location, options[:external_coauthor_name], options[:external_coauthor_email])
      end
      external_author_names.each do |external_author_name|
        next if !external_author_name || external_author_name.external_author.blank?

        if external_author_name.external_author.do_not_import
          # we're not allowed to import works from this address
          raise Error, "Author #{external_author_name.name} at #{external_author_name.external_author.email} does not allow importing their work to this archive."
        end
        work.external_creatorships.build(external_author_name: external_author_name,
                                         archivist: (options[:archivist] || User.current_user))
      end
    end

    # lock to registered users if specified or importing for others
    work.restricted = options[:restricted] || options[:importing_for_others] || false

    # set comment permissions
    work.comment_permissions = options[:comment_permissions] || "enable_all"
    work.moderated_commenting_enabled = options[:moderated_commenting_enabled] || false

    # set default values for required tags
    work.fandom_string = meta_or_default(work.fandom_string, options[:fandom], ArchiveConfig.FANDOM_NO_TAG_NAME)
    work.rating_string = meta_or_default(work.rating_string, options[:rating], ArchiveConfig.RATING_DEFAULT_TAG_NAME)
    work.archive_warning_strings = meta_or_default(work.archive_warning_strings, options[:archive_warning], ArchiveConfig.WARNING_DEFAULT_TAG_NAME)
    work.category_string = meta_or_default(work.category_string, options[:category], [])
    work.character_string = meta_or_default(work.character_string, options[:character], [])
    work.relationship_string = meta_or_default(work.relationship_string, options[:relationship], [])
    work.freeform_string = meta_or_default(work.freeform_string, options[:freeform], [])

    # set default values for title, summary and notes
    work.title = meta_or_default(work.title, options[:title], "Untitled Imported Work")
    work.summary = meta_or_default(work.summary, options[:summary], '')
    work.notes = meta_or_default(work.notes, options[:notes], '')

    # set collection name if present
    work.collection_names = get_collection_names(options[:collection_names]) if options[:collection_names].present?

    # set default language (English)
    work.language_id = options[:language_id] || Language.default.id
    work.posted = true if options[:post_without_preview]
    work.chapters.each do |chapter|
      if chapter.content.length > ArchiveConfig.CONTENT_MAX
        # TODO: eventually: insert a new chapter
        chapter.content = chapter.content.truncate(
          ArchiveConfig.CONTENT_MAX,
          omission: "WARNING: import truncated automatically because chapter was too long! Please add a new chapter for remaining content.",
          separator: "\n\n"
        )
      elsif chapter.content.empty?
        raise Error, "Chapter #{chapter.position} of \"#{work.title}\" is blank."
      end
      chapter.posted = true
      # do not save - causes the chapters to exist even if work doesn't get created!
    end
    work
  end

  def parse_author_from_lj(location)
    return if location !~ %r{^(?:http:\/\/)?(?<lj_name>[^.]*)\.(?<site_name>livejournal\.com|dreamwidth\.org|insanejournal\.com|journalfen\.net)}

    email = ""
    lj_name = Regexp.last_match[:lj_name]
    site_name = Regexp.last_match[:site_name]
    if lj_name == "community"
      # whups
      post_text = download_text(location)
      doc = Nokogiri.parse(post_text)
      lj_name = doc.xpath("/html/body/div[2]/div/div/div/table/tbody/tr/td[2]/span/a[2]/b").content
    end
    profile_url = "http://#{lj_name}.#{site_name}/profile"
    lj_profile = download_text(profile_url)
    doc = Nokogiri.parse(lj_profile)
    contact = doc.css('div.contact').inner_html
    if contact.present?
      # strip the "Contact:" label and formatting markup so only the address is left
      contact.gsub!(/Contact:/, "")
      contact.gsub!(%r{</?(span|i)>}, "")
      contact.delete!("\n")
      contact.gsub!("<br/>", "")
      if contact =~ /(.*@.*\..*)/
        email = Regexp.last_match[1]
      end
    end
    email = "#{lj_name}@#{site_name}" if email.blank?
    parse_author_common(email, lj_name)
  end

  def parse_author_from_unknown(_location)
    # for now, nothing
    nil
  end

  def parse_author_common(email, name)
    errors = []
    errors << "No author name specified" if name.blank?
    if email.present?
      external_author = ExternalAuthor.find_or_create_by(email: email)
      errors += external_author.errors.full_messages
    else
      errors << "No author email specified"
    end
    raise Error, errors.join("\n") if errors.present?

    # convert to ASCII and strip out invalid characters (everything except alphanumeric characters, _, @ and -)
    redacted_name = name.to_ascii.gsub(/[^\w[ \-@.]]/u, "")
    if redacted_name.present?
      external_author.names.find_or_create_by(name: redacted_name)
    else
      external_author.default_name
    end
  end

  def get_chapter_from_work_params(work_params)
    @chapter = Chapter.new(work_params[:chapter_attributes])
    # don't override specific chapter params (eg title) with work params
    chapter_params = work_params.delete_if do |name, _param|
      !@chapter.attribute_names.include?(name.to_s) || !@chapter.send(name.to_s).blank?
    end
    @chapter.update(chapter_params)
    @chapter
  end

  def download_text(location)
    source = get_source_if_known(KNOWN_STORY_LOCATIONS, location)
    if source.nil?
      download_with_timeout(location)
    else
      send("download_from_#{source.downcase}", location)
    end
  end

  # canonicalize the url for downloading from lj or clones
  def download_from_lj(location)
    url = location
    url.gsub!(/\#(.*)$/, "") # strip off any anchor information
    url.gsub!(/\?(.*)$/, "") # strip off any existing params at the end
    url.gsub!('_', '-') # convert underscores in usernames to hyphens
    url += "?format=light" # go to light format
    text = download_with_timeout(url)
    if text.match(/adult_check/)
      Timeout.timeout(STORY_DOWNLOAD_TIMEOUT) do
        begin
          agent = Mechanize.new
          form = url.include?("dreamwidth") ? agent.get(url).forms.first : agent.get(url).forms.third
          page = agent.submit(form, form.buttons.first) # submits the adult concepts form
          text = page.body.force_encoding(agent.page.encoding)
        rescue
          text = ""
        end
      end
    end
    text
  end

  # grab all the chapters of the story from ff.net
  def download_chaptered_from_ffnet(_location)
    raise Error, "Sorry, Fanfiction.net does not allow imports from their site."
  end

  def download_chaptered_from_quotev(_location)
    raise Error, "Sorry, Quotev.com does not allow imports from their site."
  end
  # this is an efiction archive but it doesn't handle chapters normally
  # best way to handle is to get the full story printable version
  # We have to make it a download-chaptered because otherwise it gets sent to the
  # generic efiction version since chaptered sources are checked first
  def download_chaptered_from_thearchive_net(location)
    if location.match(/^(.*)\/.*viewstory\.php.*[^p]sid=(\d+)($|&)/i)
      location = "#{$1}/viewstory.php?action=printable&psid=#{$2}"
    end
    text = download_with_timeout(location)
    # clean up the broken head emitted by some efiction printable pages
    text.sub!('</head>', '') unless text.match('<head>')
    [text]
  end

  # grab all the chapters of a story from an efiction-based site
  def download_chaptered_from_efiction(location)
    chapter_contents = []
    if location.match(/^(?<site>.*)\/.*viewstory\.php.*sid=(?<storyid>\d+)($|&)/i)
      site = Regexp.last_match[:site]
      storyid = Regexp.last_match[:storyid]
      chapnum = 1
      last_body = ""
      Timeout.timeout(STORY_DOWNLOAD_TIMEOUT) do
        loop do
          url = "#{site}/viewstory.php?action=printable&sid=#{storyid}&chapter=#{chapnum}"
          body = download_with_timeout(url)
          # get a section to check that this isn't a duplicate of the previous chapter
          body_to_check = body.slice(10, DUPLICATE_CHAPTER_LENGTH)
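          # e.g. with DUPLICATE_CHAPTER_LENGTH = 10_000 this compares roughly the
          # first ten thousand characters (from offset 10) against the previous
          # chapter, so a site that serves the same page for every chapter number
          # stops the loop instead of importing MAX_CHAPTER_COUNT copies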
          if body.nil? || body_to_check == last_body || chapnum > MAX_CHAPTER_COUNT ||
             body.match(/<div class='chaptertitle'> by <\/div>/) ||
             body.match(/Access denied./) || body.match(/Chapter : /)
            break
          end

          # save the value to check for duplicate chapter
          last_body = body_to_check

          # clean up the broken head in many efiction printable sites
          body.sub!('</head>', '') unless body.match('<head>')
          chapter_contents << body
          chapnum += 1
        end
      end
    end
    chapter_contents
  end

  # This is the heavy lifter, invoked by all the story and chapter parsers.
  # It takes a single string containing the raw contents of a story, parses it with
  # Nokogiri into the @doc object, and then calls a subparser.
  #
  # If the story source can be identified as one of the sources we know how to parse in some
  # custom/special way, parse_common calls the customized parse_story_from_[source] method.
  # Otherwise, it falls back to parse_story_from_unknown.
  #
  # This produces a hash equivalent to the params hash that is normally created by the standard
  # work upload form.
  #
  # parse_common then calls sanitize_params (which would also be called on the standard work
  # upload form results) and returns the final sanitized hash.
  #
  def parse_common(story, location = nil, encoding = nil, detect_tags = true)
    work_params = { title: "Untitled Imported Work", chapter_attributes: { content: "" } }

    # Encode as HTML - the dummy "foo" tag will be stripped out by the sanitizer but
    # forces Nokogiri to preserve line breaks in plain text documents
    # Rescue all errors as Nokogiri complains about things the sanitizer will fix later
    story.prepend("<foo></foo>")
    @doc = begin
      Nokogiri::HTML5.parse(story, encoding: encoding)
    rescue StandardError
      Nokogiri::HTML5.parse("")
    end

    # Try to convert all relative links to absolute
    base = @doc.at_css("base") ? @doc.css("base")[0]["href"] : location&.split("?")&.first
    if base.present?
      @doc.css("a").each do |link|
        next if link["href"].blank? || link["href"].start_with?("#")

        begin
          query = link["href"].match(/(\?.*)$/) ? $1 : ""
          link["href"] = URI.join(base, link["href"].gsub(/(\?.*)$/, "")).to_s + query
        rescue
          # ignored
        end
      end
    end

    # Extract metadata (unless detect_tags is false)
    if location && (source = get_source_if_known(KNOWN_STORY_PARSERS, location))
      params = send("parse_story_from_#{source.downcase}", story, detect_tags)
      work_params.merge!(params)
    else
      work_params.merge!(parse_story_from_unknown(story, detect_tags))
    end

    shift_chapter_attributes(sanitize_params(work_params))
  end

  # our fallback: parse a story from an unknown source, so we have no special
  # rules.
  def parse_story_from_unknown(story, detect_tags = true)
    work_params = { chapter_attributes: {} }
    story_head = ""
    story_head = @doc.css("head").inner_html if @doc.css("head")

    # Story content - look for progressively less specific containers or grab everything
    element = @doc.at_css('.chapter-content') || @doc.at_css('body') || @doc.at_css('html') || @doc
    storytext = element ? element.inner_html : story

    meta = {}
    meta.merge!(scan_text_for_meta(story_head, detect_tags)) unless story_head.blank?
    meta.merge!(scan_text_for_meta(story, detect_tags))
    meta[:title] ||= @doc.css('title').inner_html
    work_params[:chapter_attributes][:title] = meta.delete(:chapter_title)
    work_params[:chapter_attributes][:content] = clean_storytext(storytext)
    work_params.merge!(meta)
  end
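  # The hash returned above mirrors the standard upload-form params, roughly
  # (field values hypothetical):
  #   { title: "Some Title", fandom_string: "Some Fandom",
  #     chapter_attributes: { title: nil, content: "<p>story text</p>" } }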
  # Parses a story from livejournal or a livejournal equivalent (eg, dreamwidth, insanejournal)
  # Assumes that we have downloaded the story from one of those equivalents (ie, we've downloaded
  # it in format=light which is a stripped-down plaintext version.)
  #
  def parse_story_from_lj(_story, detect_tags = true)
    work_params = { chapter_attributes: {} }

    # in LJ "light" format, the story contents are in the second div
    # inside the body.
    body = @doc.css("body")
    storytext = body.css("article.b-singlepost-body").inner_html
    storytext = body.css("div.aentry-post__text").inner_html if storytext.empty?
    storytext = body.inner_html if storytext.empty?

    # cleanup the text
    # storytext.gsub!(%r{<br\s*/?>}i, "\n") # replace the breaks with newlines
    storytext = clean_storytext(storytext)

    work_params[:chapter_attributes][:content] = storytext
    work_params[:title] = @doc.css("title").inner_html
    work_params[:title].gsub!(/^[^:]+: /, "")
    work_params.merge!(scan_text_for_meta(storytext, detect_tags))

    date = @doc.css("time.b-singlepost-author-date")
    date = @doc.css("p.aentry-head__date time") if date.empty?
    work_params[:revised_at] = convert_revised_at(date.first.inner_text) unless date.empty?

    work_params
  end

  def parse_story_from_dw(_story, detect_tags = true)
    work_params = { chapter_attributes: {} }

    body = @doc.css("body")
    content_divs = body.css("div.contents")
    if content_divs[0].present?
      # Get rid of the DW metadata table
      content_divs[0].css("div.currents, ul.entry-management-links, div.header.inner, span.restrictions, h3.entry-title").each(&:remove)
      storytext = content_divs[0].inner_html
    else
      storytext = body.inner_html
    end

    # cleanup the text
    storytext = clean_storytext(storytext)

    work_params[:chapter_attributes][:content] = storytext
    work_params[:title] = @doc.css("title").inner_html
    work_params[:title].gsub!(/^[^:]+: /, "")
    work_params.merge!(scan_text_for_meta(storytext, detect_tags))

    font_blocks = @doc.xpath('//font')
    unless font_blocks.empty?
      date = font_blocks.first.inner_text
      work_params[:revised_at] = convert_revised_at(date)
    end

    # get the date
    date = @doc.css("span.date").inner_text
    work_params[:revised_at] = convert_revised_at(date)

    work_params
  end

  def parse_story_from_deviantart(_story, detect_tags = true)
    work_params = { chapter_attributes: {} }
    storytext = ""
    notes = ""

    body = @doc.css("body")
    title = @doc.css("title").inner_html.gsub(/\s*on deviantart$/i, "")

    # Find the image (original size) if it's art
    image_full = body.css("div.dev-view-deviation img.dev-content-full")
    unless image_full[0].nil?
      storytext = "<center><img src=\"#{image_full[0]['src']}\"></center>"
    end
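    # For an art deviation the whole chapter body is just the wrapped image,
    # e.g. (hypothetical src): <center><img src="https://images.example/full.jpg"></center>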
" end # Find the fic text if it's fic (needs the id for disambiguation, the "deviantART loves you" bit in the footer has the same class path) text_table = body.css(".grf-indent > div:nth-child(1)")[0] unless text_table.nil? # Try to remove some metadata (title and author) from the work's text, if possible # Try to remove the title: if it exists, and if it's the same as the browser title if text_table.css("h1")[0].present? && title && title.match(text_table.css("h1")[0].text) text_table.css("h1")[0].remove end # Try to remove the author: if it exists, and if it follows a certain pattern if text_table.css("small")[0].present? && text_table.css("small")[0].inner_html.match(/by ~.*?}i, "\n") # replace the breaks with newlines storytext = clean_storytext(storytext) work_params[:chapter_attributes][:content] = storytext # Find the notes content_divs = body.css("div.text-ctrl div.text") notes = content_divs[0].inner_html unless content_divs[0].nil? # cleanup the notes notes.gsub!(%r{}i, "\n") # replace the breaks with newlines notes = clean_storytext(notes, "notes") work_params[:notes] = notes work_params.merge!(scan_text_for_meta(notes, detect_tags)) work_params[:title] = title body.css("div.dev-title-container h1 a").each do |node| if node["class"] != "u" work_params[:title] = node.inner_html end end tags = [] @doc.css("div.dev-about-cat-cc a.h").each { |node| tags << node.inner_html } work_params[:freeform_string] = clean_tags(tags.join(ArchiveConfig.DELIMITER_FOR_OUTPUT)) details = @doc.css("div.dev-right-bar-content span[title]") unless details[0].nil? work_params[:revised_at] = convert_revised_at(details[0].inner_text) end work_params end # Move and/or copy any meta attributes that need to be on the chapter rather # than on the work itself def shift_chapter_attributes(work_params) CHAPTER_ATTRIBUTES_ONLY.each_pair do |work_attrib, chapter_attrib| if work_params[work_attrib] && !work_params[:chapter_attributes][chapter_attrib] work_params[:chapter_attributes][chapter_attrib] = work_params[work_attrib] work_params.delete(work_attrib) end end # copy any attributes from work to chapter as necessary CHAPTER_ATTRIBUTES_ALSO.each_pair do |work_attrib, chapter_attrib| if work_params[work_attrib] && !work_params[:chapter_attributes][chapter_attrib] work_params[:chapter_attributes][chapter_attrib] = work_params[work_attrib] end end work_params end # Find any cases of the given pieces of meta in the given text # and return a hash of meta values def scan_text_for_meta(text, detect_tags = true) # break up the text with some extra newlines to make matching more likely # and strip out some tags text = text.gsub(/
    meta = {}
    metapatterns = detect_tags ? REQUIRED_META.merge(OPTIONAL_META) : REQUIRED_META
    is_tag = {}.tap do |h|
      %w[fandom_string relationship_string freeform_string rating_string archive_warning_string].each do |c|
        h[c.to_sym] = true
      end
    end
    handler = {}.tap do |h|
      %w[rating_string revised_at].each do |c|
        h[c.to_sym] = "convert_#{c.to_s.downcase}"
      end
    end
    # 1. Look for "Pattern: (whatever)", optionally followed by a closing p or div tag
    # 2. Set meta[:metaname] = whatever
    #    eg, if it finds "Fandom: Stargate SG-1" it will set meta[:fandom] = "Stargate SG-1"
    # 3. Run the value through convert_[metaname] for cleanup if such a function is defined (eg convert_rating_string)
    metapatterns.each do |metaname, pattern|
      metapattern = Regexp.new("(?:#{pattern}|#{pattern.pluralize})\s*:\s*(.*?)(?:</p>|</div>)?$", Regexp::IGNORECASE)
      if text.match(metapattern)
        value = Regexp.last_match[1]
        value = clean_tags(value) if is_tag[metaname]
        value = clean_close_html_tags(value)
        value.strip! # lose leading/trailing whitespace
        value = send(handler[metaname], value) if handler[metaname]
        meta[metaname] = value
      end
    end
    post_process_meta meta
  end

  def download_with_timeout(location, limit = 10)
    story = ""
    Timeout.timeout(STORY_DOWNLOAD_TIMEOUT) do
      begin
        # we do a little cleanup here in case the user hasn't included the 'http://'
        # or if they've used capital letters or an underscore in the hostname
        uri = UrlFormatter.new(location).standardized
        raise Error, I18n.t("story_parser.on_archive") if ArchiveConfig.PERMITTED_HOSTS.include?(uri.host)

        response = Net::HTTP.get_response(uri)
        case response
        when Net::HTTPSuccess
          story = response.body
        when Net::HTTPRedirection
          if limit.positive?
            new_uri = URI.parse(response["location"])
            new_uri = URI.join(uri, new_uri) if new_uri.relative?
            story = download_with_timeout(new_uri.to_s, limit - 1)
          end
        else
          Rails.logger.error("------- STORY PARSER: download_with_timeout: response is not success or redirection ------")
          nil
        end
      rescue Errno::ECONNREFUSED, SocketError, EOFError => e
        Rails.logger.error("------- STORY PARSER: download_with_timeout: error rescue: \n#{e.inspect} ------")
        nil
      end
    end
    if story.blank?
      raise Error, "We couldn't download anything from #{location}. Please make sure that the URL is correct and complete, and try again."
    end

    # clean up any erroneously included string terminator (AO3-2251)
    story.delete("\000")
  end

  def get_last_modified(location)
    Timeout.timeout(STORY_DOWNLOAD_TIMEOUT) do
      resp = URI.open(location)
      resp.last_modified
    end
  end

  def get_source_if_known(known_sources, location)
    known_sources.each do |source|
      pattern = Regexp.new(self.class.const_get("SOURCE_#{source.upcase}"), Regexp::IGNORECASE)
      return source if location.match(pattern)
    end
    nil
  end

  def clean_close_html_tags(value)
    # if there are any closing html tags at the start of the value let's ditch them
    value.gsub(/^(\s*<\/[^>]+>)+/, '')
  end

  # We clean the text as if it had been submitted as the content of a chapter
  def clean_storytext(storytext, field = "content")
    storytext = storytext.encode("UTF-8", invalid: :replace, undef: :replace, replace: "") unless storytext.encoding.name == "UTF-8"
    sanitize_value(field, storytext)
  end

  # works conservatively -- doesn't split on spaces; truncates instead.
  def clean_tags(tags)
    tags = Sanitize.clean(tags.force_encoding("UTF-8")) # no html allowed in tags
    tags_list = tags =~ /,/ ? tags.split(/,/) : [tags]
    new_list = []
    tags_list.each do |tag|
      tag.gsub!(/[*<>]/, '')
      tag = truncate_on_word_boundary(tag, ArchiveConfig.TAG_MAX)
      new_list << tag unless tag.blank?
    end
    new_list.join(ArchiveConfig.DELIMITER_FOR_OUTPUT)
  end
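  # A quick behavior sketch of clean_tags above (tag string hypothetical,
  # assuming ArchiveConfig.DELIMITER_FOR_OUTPUT == ", "):
  #   clean_tags("<b>angst</b>, fluff*") # => "angst, fluff"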
  def truncate_on_word_boundary(text, max_length)
    return if text.blank?

    words = text.split
    truncated = words.first
    if words.length > 1
      words[1..words.length].each do |word|
        truncated += " " + word if truncated.length + word.length + 1 <= max_length
      end
    end
    truncated[0..max_length - 1]
  end

  # convert space-separated tags to comma-separated
  def clean_and_split_tags(tags)
    tags = tags.split(/\s+/).join(',') if !tags.match(/,/) && tags.match(/\s/)
    clean_tags(tags)
  end

  # Convert the common ratings into whatever ratings we're
  # using on this archive.
  def convert_rating_string(rating)
    rating = rating.downcase
    if rating =~ /^(nc-?1[78]|x|ma|explicit)/
      ArchiveConfig.RATING_EXPLICIT_TAG_NAME
    elsif rating =~ /^(r|m|mature)/
      ArchiveConfig.RATING_MATURE_TAG_NAME
    elsif rating =~ /^(pg-?1[35]|t|teen)/
      ArchiveConfig.RATING_TEEN_TAG_NAME
    elsif rating =~ /^(pg|g|k\+|k|general audiences)/
      ArchiveConfig.RATING_GENERAL_TAG_NAME
    else
      ArchiveConfig.RATING_DEFAULT_TAG_NAME
    end
  end

  def convert_revised_at(date_string)
    date = nil
    if date_string =~ /^(\d+)$/
      # probably seconds since the epoch
      date = Time.at(Regexp.last_match[1].to_i)
    end
    date ||= Date.parse(date_string)
    return '' if date > Date.current

    date
  rescue ArgumentError, TypeError
    ''
  end

  # Additional processing for meta - currently to make sure warnings
  # that aren't Archive warnings become additional tags instead
  def post_process_meta(meta)
    if meta[:archive_warning_string]
      result = process_warnings(meta[:archive_warning_string], meta[:freeform_string])
      meta[:archive_warning_string] = result[:archive_warning_string]
      meta[:freeform_string] = result[:freeform_string]
    end
    meta
  end

  def process_warnings(warning_string, freeform_string)
    result = { archive_warning_string: warning_string, freeform_string: freeform_string }
    new_warning = ''
    result[:archive_warning_string].split(/\s?,\s?/).each do |warning|
      if ArchiveWarning.warning?(warning)
        new_warning += ', ' unless new_warning.blank?
        new_warning += warning
      else
        result[:freeform_string] = (result[:freeform_string] || '') + ", #{warning}"
      end
    end
    result[:archive_warning_string] = new_warning
    result
  end

  # tries to find appropriate existing collections and converts them to comma-separated collection names only
  def get_collection_names(collection_string)
    collections = ""
    collection_string.split(',').map(&:squish).each do |collection_name|
      collection = Collection.find_by(name: collection_name) || Collection.find_by(title: collection_name)
      if collection
        collections += ", " unless collections.blank?
        collections += collection.name
      end
    end
    collections
  end

  # determine which value to use for a metadata field
  def meta_or_default(detected_field, provided_field, default = nil)
    if @options[:override_tags] || detected_field.blank?
      if provided_field.blank?
        detected_field.blank? ? default : detected_field
      else
        provided_field
      end
    else
      detected_field
    end
  end
end