# otwarchive-symphonyarchive/app/models/download_writer.rb

require "open3"

# Writes the downloadable version of a work to disk.
#
# The HTML version is rendered from the "downloads/show" template. The ebook
# formats (azw3, epub, mobi, pdf) are then derived from that HTML file by
# running three external commands in order: web2disk, zip and ebook-convert.
class DownloadWriter
  attr_reader :download, :work

  # download - the object describing what to generate. This class reads its
  #            work, file_type, exists?, page_title, chapters, authors and
  #            the html_file_path / assets_path / zip_path / file_path values.
  def initialize(download)
    @download = download
    @work = download.work
  end

  # Generate the HTML file and, unless HTML itself was requested, the ebook
  # file derived from it.
  #
  # Returns the download that was passed to the constructor.
  def write
    generate_html_download
    generate_ebook_download unless download.file_type == "html"
    download
  end

  # Render the work with the "downloads/show" template inside the "barebones"
  # layout, outside of a request cycle.
  #
  # Returns whatever the renderer returns for that template.
  def generate_html
    renderer = ApplicationController.renderer.new(
      http_host: ArchiveConfig.APP_HOST
    )
    renderer.render(
      template: "downloads/show",
      layout: "barebones",
      assigns: {
        work: work,
        page_title: download.page_title,
        chapters: download.chapters
      }
    )
  end

  private

  # Write the HTML version to file.
  # Does nothing when download.exists? is already true.
  def generate_html_download
    return if download.exists?

    File.open(download.html_file_path, "w:UTF-8") { |f| f.write(generate_html) }
  end

  # Transform the HTML version into the ebook version.
  #
  # Does nothing for file types other than azw3, epub, mobi and pdf, or when
  # download.exists? is already true. Every command from get_commands is run,
  # even if an earlier one failed; each failure is logged as a warning.
  #
  # Errors raised while spawning a command (for example a missing executable)
  # are not rescued here.
  def generate_ebook_download
    return unless %w[azw3 epub mobi pdf].include?(download.file_type)
    return if download.exists?

    get_commands.each do |cmd|
      # Each command is an array of arguments, so it is executed directly
      # rather than being interpreted by a shell.
      #
      # capture3 reads stdout and stderr to completion (we discard both), so
      # a command that produces a lot of output cannot stall on a full pipe
      # while we wait for it to exit.
      _stdout, _stderr, status = Open3.capture3(*cmd)

      # A Process::Status object is always truthy, so the exit code has to be
      # inspected explicitly. success? is also falsy when the process was
      # killed by a signal.
      next if status.success?

      Rails.logger.warn "Download generation failed: #{cmd}"
    end
  end

  # Get the commands we need to execute, in execution order: fetch the HTML
  # and its images, zip them, then convert the zip.
  def get_commands
    [get_web2disk_command, get_zip_command, get_calibre_command]
  end

  # Create the format-specific command-line call to calibre/ebook-convert.
  #
  # Returns an array of arguments: the common options first, followed by the
  # series, CSS, epub-only and pdf-only options (each possibly empty).
  def get_calibre_command
    # Add info about first series if any
    series = []
    if meta[:series_title].present?
      series = ["--series", meta[:series_title],
                "--series-index", meta[:series_position]]
    end

    ### Format-specific options
    # epub: don't generate a cover image
    epub = download.file_type == "epub" ? ["--no-default-epub-cover"] : []

    pdf = []
    if download.file_type == "pdf"
      pdf = [
        # pdf: decrease margins from 72pt default
        "--pdf-page-margin-top", "36",
        "--pdf-page-margin-right", "36",
        "--pdf-page-margin-bottom", "36",
        "--pdf-page-margin-left", "36",
        "--pdf-default-font-size", "17",
        # pdf: only include necessary characters when embedding fonts
        "--subset-embedded-fonts"
      ]
    end

    ### CSS options
    # azw3, epub, and mobi get a special stylesheet
    css = []
    if %w[azw3 epub mobi].include?(download.file_type)
      css = ["--extra-css",
             Rails.public_path.join("stylesheets/ebooks.css").to_s]
    end

    [
      "ebook-convert",
      download.zip_path,
      download.file_path,
      "--input-encoding", "utf-8",
      # Prevent it from turning links to endnotes into entries for the table of
      # contents on works with fewer than the specified number of chapters.
      "--toc-threshold", "0",
      "--use-auto-toc",
      "--title", meta[:title],
      "--title-sort", meta[:sortable_title],
      "--authors", meta[:authors],
      "--author-sort", meta[:sortable_authors],
      "--comments", meta[:summary],
      "--tags", meta[:tags],
      "--pubdate", meta[:pubdate],
      "--publisher", ArchiveConfig.APP_NAME,
      "--language", meta[:language],
      # XPaths for detecting chapters are overly specific to make sure we don't grab
      # anything inputted by the user. First path is for single-chapter works,
      # second for multi-chapter, and third for the preface and afterword
      "--chapter", "//h:body/h:div[@id='chapters']/h:h2[@class='toc-heading'] | //h:body/h:div[@id='chapters']/h:div[@class='meta group']/h:h2[@class='heading'] | //h:body/h:div[@id='preface' or @id='afterword']/h:h2[@class='toc-heading']"
    ] + series + css + epub + pdf
  end

  # Grab the HTML file and any images and put them in --base-dir.
  # --max-recursions 0 prevents it from grabbing all the linked pages.
  # --dont-download-stylesheets isn't strictly necessary for us but avoids
  # creating an empty stylesheets directory.
  def get_web2disk_command
    [
      "web2disk",
      "--base-dir", download.assets_path,
      "--max-recursions", "0",
      "--dont-download-stylesheets",
      "file://#{download.html_file_path}"
    ]
  end

  # Zip the directory containing the HTML file and images.
  def get_zip_command
    [
      "zip",
      "-r",
      download.zip_path,
      download.assets_path
    ]
  end

  # A hash of the work data calibre needs.
  #
  # Built on first use and memoized in @metadata. The :series_title and
  # :series_position keys are only present when the work has a series.
  def meta
    return @metadata if @metadata

    @metadata = {
      title:             work.title,
      sortable_title:    work.sorted_title,
      # Using ampersands as instructed by Calibre's ebook-convert documentation
      # hides all but the first author name in Books (formerly iBooks). The
      # other authors cannot be used for searching or sorting. Using commas
      # just means Calibre's GUI treats it as one name, e.g. "testy, testy2" is
      # like "Fangirl, Suzy Q", for searching and sorting.
      authors:           download.authors,
      sortable_authors:  work.authors_to_sort_on,
      # We add "Fanworks" because Books uses the first tag as the category and
      # it would otherwise be the work's rating, which is weird.
      tags:              "Fanworks, #{work.tags.pluck(:name).join(', ')}",
      pubdate:           work.revised_at.to_date.to_s,
      summary:           work.summary.to_s,
      language:          work.language.short
    }
    # Only the first series is passed on to calibre.
    if work.series.exists?
      series = work.series.first
      @metadata[:series_title] = series.title
      @metadata[:series_position] = series.position_of(work).to_s
    end
    @metadata
  end
end