otwarchive-symphonyarchive/lib/paragraph_maker.rb

281 lines
10 KiB
Ruby
Raw Permalink Normal View History

2026-03-11 22:22:11 +00:00
# frozen_string_literals: true
# Traverse a Nokogiri document tree recursively in order to insert linebreaks.
module ParagraphMaker
extend self
# Tags that will be stripped by the sanitizer that are inline, and should be
# wrapped in paragraphs and have their contents left untouched:
INLINE_INVALID_TAGS = %w[button input label map select textarea].freeze
# Tags that will be stripped by the sanitizer that are block tags, and should
# have nearby whitespace stripped:
BLOCK_INVALID_TAGS = %w[fieldset form].freeze
# Tags that will be completely removed by the sanitizer, and should have
# nearby whitespace removed, and their contents left untouched:
REMOVED_INVALID_TAGS = Sanitize::Config::ARCHIVE[:remove_contents]
# Tags whose content we don't touch
TAG_NAMES_TO_SKIP = (%w[
a abbr acronym address audio dl embed figure h1 h2 h3 h4 h5 h6 hr img ol
object p pre source summary table track video ul
] + INLINE_INVALID_TAGS + REMOVED_INVALID_TAGS).freeze
# Tags that need to go inside p tags
TAG_NAMES_TO_WRAP = (%w[
a abbr acronym b big br cite code del dfn em i img ins kbd q rp rt ruby
s samp small span strike strong sub sup tt u var
] + INLINE_INVALID_TAGS).freeze
# Tags that can't be inside p tags
TAG_NAMES_TO_UNWRAP = %w[
audio details dl figure h1 h2 h3 h4 h5 h6 hr ol p pre source summary table track ul video
].freeze
# Tags before and after which we don't want to convert linebreaks
# into br's and p's
TAG_NAMES_STRIP_WHITESPACE = (%w[
audio blockquote br center details dl div figure figcaption h1 h2 h3 h4 h5 h6 hr ol
p pre source summary table track ul video
] + BLOCK_INVALID_TAGS + REMOVED_INVALID_TAGS).freeze
private
# Traverse all nodes from the given root, yielding each node to the given
# block. Processes nodes that are on the list of TAG_NAMES_TO_SKIP, but
# doesn't process any of their children.
def traverse(node, &block)
block.call(node)
return if TAG_NAMES_TO_SKIP.include?(node.name)
node.children.each do |child|
traverse(child, &block)
end
end
# Checks whether the given node is in a paragraph. This includes nodes where
# the paragraph is many layers up.
def in_paragraph?(node)
return false if node.parent.nil?
return true if node.parent.name == "p"
in_paragraph?(node.parent)
end
# Strip whitespace before and after tags in the TAG_NAMES_STRIP_WHITESPACE
# list.
def strip_whitespace(root)
traverse(root) do |node|
next unless node.text? || node.cdata?
# This text node immediately follows either the closing tag of its
# previous sibling, or the opening tag of its parent if it has no
# previous sibling. If we're supposed to ignore whitespace after that
# tag, we should lstrip to get rid of it.
node.content = node.content.lstrip if TAG_NAMES_STRIP_WHITESPACE.include?(node.previous_sibling&.name || node.parent&.name)
# This text node is immediately followed by either the opening tag of its
# next sibling, or the closing tag of its parent. If that tag is one in
# the TAG_NAMES_STRIP_WHITESPACE list, we want to rstrip this node.
node.content = node.content.rstrip if TAG_NAMES_STRIP_WHITESPACE.include?(node.next_sibling&.name || node.parent&.name)
node.unlink if node.content.empty?
end
end
# Split text nodes when they have newlines:
def split_text_at_newlines(root)
traverse(root) do |node|
next unless node.text?
pieces = node.to_s.split(/([[:space:]]*\n[[:space:]]*)/)
next unless pieces.length > 1
pieces.each do |piece|
next if piece.empty?
case piece.count("\n")
when 0
# No newlines, it's just a normal piece of text:
node.add_previous_sibling(piece)
when 1
# One newline, replace it with a <br>.
node.add_previous_sibling("<br>\n")
when 2
# Two newlines, replace it with a special tag (to be processed later)
# that marks the paragraph to be split at this location.
node.add_previous_sibling("<split>")
else
# Three or more newlines, replace it with a special tag (to be
# processed later) that marks the paragraph to be split at this
# location, with a <p>&nbsp;</p> paragraph inserted to increase the
# spacing between lines.
node.add_previous_sibling("<split long>")
end
end
node.unlink
end
end
# Traverse the tree in search of sequences of <br> tags. We want to find the
# last <br> tag in the sequence, so that we don't process the sequence more
# than once. Once we've found the <br> tag that has a prior <br> tag, but no
# following <br> tag, we can yank out all of the <br> tags in the sequence,
# counting them as we go.
def merge_br_tags(root)
traverse(root) do |node|
next unless node.name == "br" &&
node.next_sibling&.name != "br" &&
node.previous_sibling&.name == "br"
break_count = 1
while node.previous_sibling&.name == "br"
node.previous_sibling.unlink
break_count += 1
end
if break_count == 2
# Replace the <br> sequence with a special tag (to be processed later)
# that indicates that whatever paragraph we're in needs to be split at
# this point. This corresponds to having two <br> tags in a row.
node.replace("<split>")
else
# Replace the <br> sequence with a special tag (to be processed later)
# that indicates that whatever paragraph we're in needs to be split at
# this point, and there needs to be a <p>&nbsp;</p> paragraph inserted
# into the split. This corresponds to having 3+ <br> tags in a row.
node.replace("<split long>")
end
end
end
# Go through all nodes that should be inside paragraph tags, and make sure
# they're wrapped:
def wrap_all(root)
# We don't use traverse(root) here to handle the traversal because we're
# doing something much more complex than usual with our children.
return if TAG_NAMES_TO_SKIP.include?(root.name) || in_paragraph?(root) || root.name == "p"
# Divide up the list of children into sublists based on whether the node
# needs to have a paragraph wrapped around it, or not.
chunks = root.children.chunk do |node|
node.text? || node.cdata? || TAG_NAMES_TO_WRAP.include?(node.name)
end
chunks.each do |should_wrap, children|
if should_wrap
# Create a paragraph object, and then transfer all of the children in
# this chunk to it.
paragraph = root.document.create_element("p")
children.first.add_previous_sibling(paragraph)
children.each do |node|
paragraph.add_child(node)
end
else
# These nodes don't need to be wrapped in a paragraph node, but some of
# their children might, so we have to process them.
children.each do |node|
wrap_all(node)
end
end
end
end
# Given a node, "split" its parent at that node and move the node upwards one
# level -- that is, split its siblings by whether they're left/right of the
# node, and split the parent into two copies, one containing the left siblings
# and one containing the right siblings.
#
# For example, if we have something like this:
# <ul class="actions">
# <li>First</li>
# <li>Second</li>
# <li>Third</li>
# </ul>
# Then calling split_parent on the second list item will produce this:
# <ul class="actions">
# <li>First</li>
# </ul>
# <li>Second</li>
# <ul class="actions">
# <li>Third</li>
# </ul>
def split_parent(node)
# The easy cases: if this node is the first or last node in its parent, then
# there's no point in a proper split, because one of the two tags would just
# be empty. Instead, we rearrange ourselves relative to our parent.
return node.parent.add_previous_sibling(node) if node.previous_sibling.nil?
return node.parent.add_next_sibling(node) if node.next_sibling.nil?
# Create a shallow copy of the parent:
new_parent = node.parent.dup(0)
# Copy over all of the attributes:
node.parent.attributes.each do |name, attribute|
new_parent.set_attribute(name, attribute.value)
end
# Put the new parent in the right place:
node.parent.add_next_sibling(new_parent)
# Move over all of the siblings:
new_parent.add_child(node.next_sibling) until node.next_sibling.nil?
# Put the node in the right place:
node.parent.add_next_sibling(node)
end
# Remove the given node from its containing paragraph tag, if it has one.
def extract_from_paragraph(node)
split_parent(node) while in_paragraph?(node)
end
# Go through all nodes that shouldn't be inside paragraph tags, and remove them
# from paragraph tags.
def unwrap_all(root)
traverse(root) do |node|
extract_from_paragraph(node) if TAG_NAMES_TO_UNWRAP.include?(node.name)
end
end
# Process the <split> tags inserted by split_text_at_newlines and
# merge_br_tags. Either way, we use extract_from_paragraph to split its
# parents until we reach the paragraph tag that contains it. If the split is
# marked as "long," we need to insert a <p>&nbsp;</p> blank paragraph.
# Reintroduce newlines instead to preserve chunks in separate lines.
def replace_splits(root)
root.css("split").each do |node|
extract_from_paragraph(node)
if node.attribute("long")
node.replace("\n<p>&nbsp;</p>\n")
else
node.replace("\n")
end
end
end
# Traverse the tree and delete any paragraph nodes that have no children.
def delete_empty_paragraphs(root)
traverse(root) do |node|
node.unlink if node.name == "p" && node.children.empty?
end
end
public
# Process the given node to add paragraphs to it and all of its children.
def process(root)
strip_whitespace(root)
split_text_at_newlines(root)
merge_br_tags(root)
wrap_all(root)
unwrap_all(root)
replace_splits(root)
delete_empty_paragraphs(root)
end
end