otwarchive-symphonyarchive/lib/paragraph_maker.rb

# frozen_string_literal: true

# Traverse a Nokogiri document tree recursively in order to insert linebreaks.
module ParagraphMaker
  extend self

  # Inline tags that will be stripped by the sanitizer. They are handled like
  # any other inline tag: wrapped in paragraphs (they are appended to
  # TAG_NAMES_TO_WRAP) with their contents left untouched (they are appended to
  # TAG_NAMES_TO_SKIP):
  INLINE_INVALID_TAGS = %w[button input label map select textarea].freeze

  # Block tags that will be stripped by the sanitizer. Whitespace directly
  # next to them is stripped (they are appended to TAG_NAMES_STRIP_WHITESPACE):
  BLOCK_INVALID_TAGS = %w[fieldset form].freeze

  # Tags that will be completely removed by the sanitizer. Nearby whitespace is
  # removed and their contents are left untouched (they are appended to both
  # TAG_NAMES_STRIP_WHITESPACE and TAG_NAMES_TO_SKIP). The list is read from
  # the :remove_contents entry of the archive's Sanitize configuration.
  # NOTE(review): the `+` concatenations below assume this value is an Array --
  # confirm against Sanitize::Config::ARCHIVE.
  REMOVED_INVALID_TAGS = Sanitize::Config::ARCHIVE[:remove_contents]

  # Tags whose content we don't touch: traverse still yields these nodes to its
  # block but does not visit their children, and wrap_all returns immediately
  # when handed one of them.
  TAG_NAMES_TO_SKIP = (%w[
    a abbr acronym address audio dl embed figure h1 h2 h3 h4 h5 h6 hr img ol
    object p pre source summary table track video ul
  ] + INLINE_INVALID_TAGS + REMOVED_INVALID_TAGS).freeze

  # Tags that need to go inside p tags. wrap_all gathers consecutive runs of
  # these (together with text and CDATA nodes) into a newly created <p>.
  TAG_NAMES_TO_WRAP = (%w[
    a abbr acronym b big br cite code del dfn em i img ins kbd q rp rt ruby
    s samp small span strike strong sub sup tt u var
  ] + INLINE_INVALID_TAGS).freeze

  # Tags that can't be inside p tags. unwrap_all moves any of these it visits
  # out of the paragraph that contains them.
  TAG_NAMES_TO_UNWRAP = %w[
    audio details dl figure h1 h2 h3 h4 h5 h6 hr ol p pre source summary table track ul video
  ].freeze

  # Tags before and after which we don't want to convert linebreaks
  # into br's and p's. strip_whitespace trims text that directly touches one
  # of these tags, either as an adjacent sibling or as the enclosing parent.
  TAG_NAMES_STRIP_WHITESPACE = (%w[
    audio blockquote br center details dl div figure figcaption h1 h2 h3 h4 h5 h6 hr ol
    p pre source summary table track ul video
  ] + BLOCK_INVALID_TAGS + REMOVED_INVALID_TAGS).freeze

  private

  # Depth-first, pre-order walk starting at the given node: each node reached
  # is handed to the block before any of its children are looked at. Nodes
  # named in TAG_NAMES_TO_SKIP are still yielded themselves, but the walk does
  # not continue into their children.
  #
  # Returns nil for a skipped node, otherwise whatever iterating the node's
  # children returns; callers in this module ignore the return value.
  def traverse(node, &block)
    block.call(node)

    # The block runs before the children are read, so anything the block did
    # to this node is already reflected in node.children.
    descend = !TAG_NAMES_TO_SKIP.include?(node.name)
    node.children.each { |descendant| traverse(descendant, &block) } if descend
  end

  # Reports whether any ancestor of the given node is a <p> element, however
  # many levels up it sits. The node itself is not inspected, so calling this
  # on a paragraph that is not nested in another paragraph returns false.
  def in_paragraph?(node)
    ancestor = node.parent

    # Climb towards the root until we either meet a paragraph or run out of
    # parents.
    until ancestor.nil?
      return true if ancestor.name == "p"

      ancestor = ancestor.parent
    end

    false
  end

  # Trim whitespace from text and CDATA nodes wherever it touches a tag listed
  # in TAG_NAMES_STRIP_WHITESPACE. Nodes that end up empty are removed from
  # the tree.
  def strip_whitespace(root)
    traverse(root) do |node|
      next unless node.text? || node.cdata?

      # What sits directly in front of this text: the closing tag of the
      # previous sibling, or the opening tag of the parent when there is no
      # previous sibling. Leading whitespace goes if that tag is on the list.
      tag_before = node.previous_sibling&.name || node.parent&.name
      node.content = node.content.lstrip if TAG_NAMES_STRIP_WHITESPACE.include?(tag_before)

      # Likewise for what follows: the opening tag of the next sibling, or the
      # closing tag of the parent. Trailing whitespace goes if it is listed.
      tag_after = node.next_sibling&.name || node.parent&.name
      node.content = node.content.rstrip if TAG_NAMES_STRIP_WHITESPACE.include?(tag_after)

      node.unlink if node.content.empty?
    end
  end

  # Break every text node that contains newlines into a sequence of sibling
  # nodes. Each run of whitespace around one or more newlines is replaced by
  # markup chosen from the number of newlines in the run:
  #   1  -> a <br> followed by a newline
  #   2  -> a <split> marker (paragraph break, resolved later)
  #   3+ -> a <split long> marker (paragraph break plus a <p>&nbsp;</p> spacer,
  #         resolved later)
  # Text nodes without any newline are left exactly as they are.
  def split_text_at_newlines(root)
    traverse(root) do |node|
      next unless node.text?

      # The capture group keeps the whitespace runs in the result, so text and
      # separators alternate.
      segments = node.to_s.split(/([[:space:]]*\n[[:space:]]*)/)
      next if segments.length <= 1

      segments.reject(&:empty?).each do |segment|
        newlines = segment.count("\n")
        markup =
          if newlines.zero?
            # Ordinary text between separators is re-inserted unchanged.
            segment
          elsif newlines == 1
            "<br>\n"
          elsif newlines == 2
            "<split>"
          else
            "<split long>"
          end

        # Inserting before the original node keeps the pieces in document
        # order.
        node.add_previous_sibling(markup)
      end

      # Every piece has been re-inserted, so the original node can go.
      node.unlink
    end
  end

  # Collapse each run of two or more adjacent <br> siblings into a single
  # marker element. Only the last <br> of a run triggers the work (it has a
  # <br> before it and none after it), so every run is handled exactly once.
  # The earlier <br> tags are removed one by one while being counted, and the
  # last one is then replaced:
  #   2 breaks  -> <split>       (split the enclosing paragraph here)
  #   3+ breaks -> <split long>  (split, and insert a <p>&nbsp;</p> spacer)
  # Both markers are resolved later on.
  def merge_br_tags(root)
    traverse(root) do |node|
      next if node.name != "br"
      next if node.next_sibling&.name == "br"
      next unless node.previous_sibling&.name == "br"

      # Start at 1 for the node itself, then consume the <br> tags before it.
      run_length = 1
      while (earlier = node.previous_sibling)&.name == "br"
        earlier.unlink
        run_length += 1
      end

      node.replace(run_length == 2 ? "<split>" : "<split long>")
    end
  end

  # Make sure every node that belongs inside a paragraph is inside one.
  # Consecutive children that are text, CDATA or listed in TAG_NAMES_TO_WRAP
  # are moved together into a newly created <p>; every other child is
  # processed recursively so that its own children get the same treatment.
  # Nothing is done for nodes in TAG_NAMES_TO_SKIP, for nodes already inside a
  # paragraph, or for paragraphs themselves.
  def wrap_all(root)
    # This does its own recursion instead of using traverse because children
    # have to be handled as groups rather than one at a time.
    return if TAG_NAMES_TO_SKIP.include?(root.name) || in_paragraph?(root) || root.name == "p"

    # Group consecutive children by whether they need a paragraph around them.
    runs = root.children.chunk do |child|
      child.text? || child.cdata? || TAG_NAMES_TO_WRAP.include?(child.name)
    end

    runs.each do |needs_paragraph, members|
      unless needs_paragraph
        # These stay where they are, but their children may still need
        # wrapping.
        members.each { |member| wrap_all(member) }
        next
      end

      # Put a fresh paragraph where the run starts, then move the whole run
      # into it in order.
      wrapper = root.document.create_element("p")
      members.first.add_previous_sibling(wrapper)
      members.each { |member| wrapper.add_child(member) }
    end
  end

  # Lift the given node one level up by "splitting" its parent around it. The
  # siblings to the left stay in the original parent, the siblings to the
  # right move into a copy of the parent (same tag, same attributes), and the
  # node itself ends up between the two.
  #
  # For example, starting from:
  #   <ul class="actions">
  #     <li>First</li>
  #     <li>Second</li>
  #     <li>Third</li>
  #   </ul>
  # calling split_parent on the second list item gives:
  #   <ul class="actions">
  #     <li>First</li>
  #   </ul>
  #   <li>Second</li>
  #   <ul class="actions">
  #     <li>Third</li>
  #   </ul>
  #
  # When the node is the first or last child, no copy is made: the node is
  # simply moved to just before or just after its parent.
  def split_parent(node)
    container = node.parent

    # Edge positions: a real split would leave one half empty, so the node is
    # just relocated relative to its parent.
    return container.add_previous_sibling(node) if node.previous_sibling.nil?
    return container.add_next_sibling(node) if node.next_sibling.nil?

    # Shallow copy of the parent, with the attributes carried over by hand.
    tail = container.dup(0)
    container.attributes.each do |key, attribute|
      tail.set_attribute(key, attribute.value)
    end

    # The copy goes right after the original parent...
    container.add_next_sibling(tail)

    # ...and takes over everything that followed the node, in order.
    while (follower = node.next_sibling)
      tail.add_child(follower)
    end

    # Finally the node lands between the original parent and the copy.
    container.add_next_sibling(node)
  end

  # Pull the given node out of whatever paragraph contains it, if any. Each
  # pass lifts the node one level via split_parent; this repeats until no
  # ancestor is a <p> any more.
  def extract_from_paragraph(node)
    while in_paragraph?(node)
      split_parent(node)
    end
  end

  # Visit the tree and move every node listed in TAG_NAMES_TO_UNWRAP out of
  # any paragraph that contains it. Only nodes that traverse actually yields
  # are considered; traverse does not descend into TAG_NAMES_TO_SKIP elements.
  def unwrap_all(root)
    traverse(root) do |node|
      next unless TAG_NAMES_TO_UNWRAP.include?(node.name)

      extract_from_paragraph(node)
    end
  end

  # Resolve the <split> markers left behind by split_text_at_newlines and
  # merge_br_tags. Each marker is first lifted out of its enclosing paragraph
  # with extract_from_paragraph, which splits that paragraph in two at the
  # marker's position. The marker is then replaced:
  #   - a plain split becomes a single newline, so the resulting chunks stay
  #     on separate lines;
  #   - a split carrying the "long" attribute becomes a blank <p>&nbsp;</p>
  #     paragraph with a newline on each side.
  def replace_splits(root)
    root.css("split").each do |marker|
      extract_from_paragraph(marker)

      filler = marker.attribute("long") ? "\n<p>&nbsp;</p>\n" : "\n"
      marker.replace(filler)
    end
  end

  # Walk the tree and remove every <p> that has no child nodes at all. A
  # paragraph containing anything (even a lone text node) is kept.
  def delete_empty_paragraphs(root)
    traverse(root) do |node|
      next unless node.name == "p"

      node.unlink if node.children.empty?
    end
  end

  public

  # Process the given node to add paragraphs to it and all of its children.
  # The tree under root is modified in place by running the passes below one
  # after another. Their order matters, because later passes consume what
  # earlier ones produce (for example, replace_splits resolves the <split>
  # markers inserted by split_text_at_newlines and merge_br_tags).
  #
  # The return value is simply whatever the last pass returns; the result of
  # the processing is the mutated tree itself.
  def process(root)
    # Drop whitespace touching tags in TAG_NAMES_STRIP_WHITESPACE, so it is
    # not turned into breaks by the next pass.
    strip_whitespace(root)
    # Turn newlines in text into <br> tags or <split> markers.
    split_text_at_newlines(root)
    # Collapse runs of adjacent <br> tags into <split> markers.
    merge_br_tags(root)
    # Put text and inline nodes into <p> tags.
    wrap_all(root)
    # Move tags that can't live inside a <p> back out of paragraphs.
    unwrap_all(root)
    # Split paragraphs at the <split> markers and remove the markers.
    replace_splits(root)
    # Clean up paragraphs that were left with no children.
    delete_empty_paragraphs(root)
  end
end
first 2026-03-11 22:22:11 +00:00			`# frozen_string_literals: true`

			`# Traverse a Nokogiri document tree recursively in order to insert linebreaks.`
			`module ParagraphMaker`
			`extend self`

			`# Tags that will be stripped by the sanitizer that are inline, and should be`
			`# wrapped in paragraphs and have their contents left untouched:`
			`INLINE_INVALID_TAGS = %w[button input label map select textarea].freeze`

			`# Tags that will be stripped by the sanitizer that are block tags, and should`
			`# have nearby whitespace stripped:`
			`BLOCK_INVALID_TAGS = %w[fieldset form].freeze`

			`# Tags that will be completely removed by the sanitizer, and should have`
			`# nearby whitespace removed, and their contents left untouched:`
			`REMOVED_INVALID_TAGS = Sanitize::Config::ARCHIVE[:remove_contents]`

			`# Tags whose content we don't touch`
			`TAG_NAMES_TO_SKIP = (%w[`
			`a abbr acronym address audio dl embed figure h1 h2 h3 h4 h5 h6 hr img ol`
			`object p pre source summary table track video ul`
			`] + INLINE_INVALID_TAGS + REMOVED_INVALID_TAGS).freeze`

			`# Tags that need to go inside p tags`
			`TAG_NAMES_TO_WRAP = (%w[`
			`a abbr acronym b big br cite code del dfn em i img ins kbd q rp rt ruby`
			`s samp small span strike strong sub sup tt u var`
			`] + INLINE_INVALID_TAGS).freeze`

			`# Tags that can't be inside p tags`
			`TAG_NAMES_TO_UNWRAP = %w[`
			`audio details dl figure h1 h2 h3 h4 h5 h6 hr ol p pre source summary table track ul video`
			`].freeze`

			`# Tags before and after which we don't want to convert linebreaks`
			`# into br's and p's`
			`TAG_NAMES_STRIP_WHITESPACE = (%w[`
			`audio blockquote br center details dl div figure figcaption h1 h2 h3 h4 h5 h6 hr ol`
			`p pre source summary table track ul video`
			`] + BLOCK_INVALID_TAGS + REMOVED_INVALID_TAGS).freeze`

			`private`

			`# Traverse all nodes from the given root, yielding each node to the given`
			`# block. Processes nodes that are on the list of TAG_NAMES_TO_SKIP, but`
			`# doesn't process any of their children.`
			`def traverse(node, &block)`
			`block.call(node)`

			`return if TAG_NAMES_TO_SKIP.include?(node.name)`

			`node.children.each do \|child\|`
			`traverse(child, &block)`
			`end`
			`end`

			`# Checks whether the given node is in a paragraph. This includes nodes where`
			`# the paragraph is many layers up.`
			`def in_paragraph?(node)`
			`return false if node.parent.nil?`
			`return true if node.parent.name == "p"`

			`in_paragraph?(node.parent)`
			`end`

			`# Strip whitespace before and after tags in the TAG_NAMES_STRIP_WHITESPACE`
			`# list.`
			`def strip_whitespace(root)`
			`traverse(root) do \|node\|`
			`next unless node.text? \|\| node.cdata?`

			`# This text node immediately follows either the closing tag of its`
			`# previous sibling, or the opening tag of its parent if it has no`
			`# previous sibling. If we're supposed to ignore whitespace after that`
			`# tag, we should lstrip to get rid of it.`
			`node.content = node.content.lstrip if TAG_NAMES_STRIP_WHITESPACE.include?(node.previous_sibling&.name \|\| node.parent&.name)`

			`# This text node is immediately followed by either the opening tag of its`
			`# next sibling, or the closing tag of its parent. If that tag is one in`
			`# the TAG_NAMES_STRIP_WHITESPACE list, we want to rstrip this node.`
			`node.content = node.content.rstrip if TAG_NAMES_STRIP_WHITESPACE.include?(node.next_sibling&.name \|\| node.parent&.name)`

			`node.unlink if node.content.empty?`
			`end`
			`end`

			`# Split text nodes when they have newlines:`
			`def split_text_at_newlines(root)`
			`traverse(root) do \|node\|`
			`next unless node.text?`

			`pieces = node.to_s.split(/([[:space:]]\n[[:space:]])/)`
			`next unless pieces.length > 1`

			`pieces.each do \|piece\|`
			`next if piece.empty?`

			`case piece.count("\n")`
			`when 0`
			`# No newlines, it's just a normal piece of text:`
			`node.add_previous_sibling(piece)`
			`when 1`
			`# One newline, replace it with a <br>.`
			`node.add_previous_sibling("<br>\n")`
			`when 2`
			`# Two newlines, replace it with a special tag (to be processed later)`
			`# that marks the paragraph to be split at this location.`
			`node.add_previous_sibling("<split>")`
			`else`
			`# Three or more newlines, replace it with a special tag (to be`
			`# processed later) that marks the paragraph to be split at this`
			`# location, with a <p> </p> paragraph inserted to increase the`
			`# spacing between lines.`
			`node.add_previous_sibling("<split long>")`
			`end`
			`end`

			`node.unlink`
			`end`
			`end`

			`# Traverse the tree in search of sequences of <br> tags. We want to find the`
			`# last <br> tag in the sequence, so that we don't process the sequence more`
			`# than once. Once we've found the <br> tag that has a prior <br> tag, but no`
			`# following <br> tag, we can yank out all of the <br> tags in the sequence,`
			`# counting them as we go.`
			`def merge_br_tags(root)`
			`traverse(root) do \|node\|`
			`next unless node.name == "br" &&`
			`node.next_sibling&.name != "br" &&`
			`node.previous_sibling&.name == "br"`

			`break_count = 1`
			`while node.previous_sibling&.name == "br"`
			`node.previous_sibling.unlink`
			`break_count += 1`
			`end`

			`if break_count == 2`
			`# Replace the <br> sequence with a special tag (to be processed later)`
			`# that indicates that whatever paragraph we're in needs to be split at`
			`# this point. This corresponds to having two <br> tags in a row.`
			`node.replace("<split>")`
			`else`
			`# Replace the <br> sequence with a special tag (to be processed later)`
			`# that indicates that whatever paragraph we're in needs to be split at`
			`# this point, and there needs to be a <p> </p> paragraph inserted`
			`# into the split. This corresponds to having 3+ <br> tags in a row.`
			`node.replace("<split long>")`
			`end`
			`end`
			`end`

			`# Go through all nodes that should be inside paragraph tags, and make sure`
			`# they're wrapped:`
			`def wrap_all(root)`
			`# We don't use traverse(root) here to handle the traversal because we're`
			`# doing something much more complex than usual with our children.`
			`return if TAG_NAMES_TO_SKIP.include?(root.name) \|\| in_paragraph?(root) \|\| root.name == "p"`

			`# Divide up the list of children into sublists based on whether the node`
			`# needs to have a paragraph wrapped around it, or not.`
			`chunks = root.children.chunk do \|node\|`
			`node.text? \|\| node.cdata? \|\| TAG_NAMES_TO_WRAP.include?(node.name)`
			`end`

			`chunks.each do \|should_wrap, children\|`
			`if should_wrap`
			`# Create a paragraph object, and then transfer all of the children in`
			`# this chunk to it.`
			`paragraph = root.document.create_element("p")`
			`children.first.add_previous_sibling(paragraph)`
			`children.each do \|node\|`
			`paragraph.add_child(node)`
			`end`
			`else`
			`# These nodes don't need to be wrapped in a paragraph node, but some of`
			`# their children might, so we have to process them.`
			`children.each do \|node\|`
			`wrap_all(node)`
			`end`
			`end`
			`end`
			`end`

			`# Given a node, "split" its parent at that node and move the node upwards one`
			`# level -- that is, split its siblings by whether they're left/right of the`
			`# node, and split the parent into two copies, one containing the left siblings`
			`# and one containing the right siblings.`
			`#`
			`# For example, if we have something like this:`
			`# <ul class="actions">`
			`# <li>First</li>`
			`# <li>Second</li>`
			`# <li>Third</li>`
			`# </ul>`
			`# Then calling split_parent on the second list item will produce this:`
			`# <ul class="actions">`
			`# <li>First</li>`
			`# </ul>`
			`# <li>Second</li>`
			`# <ul class="actions">`
			`# <li>Third</li>`
			`# </ul>`
			`def split_parent(node)`
			`# The easy cases: if this node is the first or last node in its parent, then`
			`# there's no point in a proper split, because one of the two tags would just`
			`# be empty. Instead, we rearrange ourselves relative to our parent.`
			`return node.parent.add_previous_sibling(node) if node.previous_sibling.nil?`
			`return node.parent.add_next_sibling(node) if node.next_sibling.nil?`

			`# Create a shallow copy of the parent:`
			`new_parent = node.parent.dup(0)`

			`# Copy over all of the attributes:`
			`node.parent.attributes.each do \|name, attribute\|`
			`new_parent.set_attribute(name, attribute.value)`
			`end`

			`# Put the new parent in the right place:`
			`node.parent.add_next_sibling(new_parent)`

			`# Move over all of the siblings:`
			`new_parent.add_child(node.next_sibling) until node.next_sibling.nil?`

			`# Put the node in the right place:`
			`node.parent.add_next_sibling(node)`
			`end`

			`# Remove the given node from its containing paragraph tag, if it has one.`
			`def extract_from_paragraph(node)`
			`split_parent(node) while in_paragraph?(node)`
			`end`

			`# Go through all nodes that shouldn't be inside paragraph tags, and remove them`
			`# from paragraph tags.`
			`def unwrap_all(root)`
			`traverse(root) do \|node\|`
			`extract_from_paragraph(node) if TAG_NAMES_TO_UNWRAP.include?(node.name)`
			`end`
			`end`

			`# Process the <split> tags inserted by split_text_at_newlines and`
			`# merge_br_tags. Either way, we use extract_from_paragraph to split its`
			`# parents until we reach the paragraph tag that contains it. If the split is`
			`# marked as "long," we need to insert a <p> </p> blank paragraph.`
			`# Reintroduce newlines instead to preserve chunks in separate lines.`
			`def replace_splits(root)`
			`root.css("split").each do \|node\|`
			`extract_from_paragraph(node)`

			`if node.attribute("long")`
			`node.replace("\n<p> </p>\n")`
			`else`
			`node.replace("\n")`
			`end`
			`end`
			`end`

			`# Traverse the tree and delete any paragraph nodes that have no children.`
			`def delete_empty_paragraphs(root)`
			`traverse(root) do \|node\|`
			`node.unlink if node.name == "p" && node.children.empty?`
			`end`
			`end`

			`public`

			`# Process the given node to add paragraphs to it and all of its children.`
			`def process(root)`
			`strip_whitespace(root)`
			`split_text_at_newlines(root)`
			`merge_br_tags(root)`
			`wrap_all(root)`
			`unwrap_all(root)`
			`replace_splits(root)`
			`delete_empty_paragraphs(root)`
			`end`
			`end`