otwarchive-symphonyarchive/lib/word_counter.rb
2026-03-11 22:22:11 +00:00

37 lines
1.2 KiB
Ruby
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# encoding=utf-8
require 'nokogiri'
# Counts the words in a piece of (possibly HTML) text.
#
# Only text nodes under the parsed document's <body> are counted, so markup
# itself never contributes to the total.
class WordCounter
  # Characters removed before tokenizing so that hyphenated words and
  # contractions/possessives stay a single word ("one-word", "one's"):
  # the ASCII apostrophe, the typographic single quotes U+2018 and U+2019,
  # and the ASCII hyphen. The hyphen is placed last so String#delete treats
  # it as a literal character rather than as a range operator.
  INTRA_WORD_PUNCTUATION = "'\u2018\u2019-".freeze

  # Em dash (U+2014). A double hyphen is turned into this *before* single
  # hyphens are stripped; it is not a word character, so "one--two" is still
  # split into two words instead of being fused into "onetwo".
  EM_DASH = "\u2014".freeze

  attr_accessor :text

  # @param text [String, nil] the raw text or HTML whose words will be counted
  def initialize(text)
    @text = text
  end

  # Counts the words in +text+.
  #
  # Words are runs of word characters. Characters belonging to one of the
  # scripts named by ArchiveConfig.CHARACTER_COUNT_SCRIPTS (scripts such as
  # Chinese and Japanese that do not put spaces between words) are counted
  # one character per word. In mixed-language text only the characters of
  # those scripts are counted individually; everything else is counted as
  # ordinary words.
  #
  # @return [Integer] the number of words; 0 when the text is nil or empty
  def count
    # avoid blank? so we don't need to load Rails for tests
    return 0 if @text.nil? || @text.empty?

    # Each configured name is interpolated into a \p{...} property class.
    scripts = ArchiveConfig.CHARACTER_COUNT_SCRIPTS.map { |lang| "\\p{#{lang}}" }.join("|")
    # Either a single character of a per-character script, or a run of word
    # characters none of which belongs to such a script. Built once here
    # rather than once per text node.
    word_pattern = /#{scripts}|((?!#{scripts})[[:word:]])+/

    body = Nokogiri::HTML5.parse(@text).xpath("//body").first
    total = 0
    body.traverse do |node|
      next unless node.text?

      total += node.inner_text
                   .gsub("--", EM_DASH)
                   .delete(INTRA_WORD_PUNCTUATION)
                   .scan(word_pattern)
                   .size
    end
    total
  end
end