otwarchive-symphonyarchive/lib/word_counter.rb

# encoding=utf-8

require 'nokogiri'

# Counts the words in a piece of (possibly HTML) text.
#
# Only text nodes under the parsed <body> contribute to the total, so markup
# itself is never counted.
class WordCounter

  attr_accessor :text

  # text - the String to count words in; may be nil or empty.
  def initialize(text)
    @text = text
  end

  # Returns the number of words in @text as an Integer (0 for nil or empty).
  #
  # only count actual text
  # scan by word boundaries after stripping hyphens and apostrophes
  # so one-word and one's will be counted as one word, not two.
  # -- is replaced by — (emdash) before strip so one--two will count as 2
  def count
    # avoid blank? so we don't need to load Rails for tests
    return 0 if @text.nil? || @text.empty?

    # The pattern is the same for every text node, so build it once instead
    # of re-interpolating (and recompiling) it on each iteration.
    pattern = word_pattern
    body = Nokogiri::HTML5.parse(@text).xpath("//body").first

    total = 0
    body.traverse do |node|
      next unless node.text?

      normalized = node.inner_text.gsub(/--/, "—").gsub(/['’‘-]/, "")
      total += normalized.scan(pattern).size
    end
    total
  end

  private

  # Builds the Regexp whose matches are each counted as one word.
  #
  # Scripts such as Chinese and Japanese that do not have space between words
  # are counted based on the number of characters. If a text include mixed
  # languages, only characters in these languages would be counted as words,
  # words in other languages are counted as usual
  def word_pattern
    scripts = ArchiveConfig.CHARACTER_COUNT_SCRIPTS.map { |lang| "\\p{#{lang}}" }.join("|")

    # With no configured scripts the alternation would start with an empty
    # branch, which matches at every position and inflates the count.
    return /[[:word:]]+/ if scripts.empty?

    # First branch: one character of a character-counted script.
    # Second branch: a run of word characters that are not in those scripts.
    # The group is non-capturing so scan yields plain matches.
    /#{scripts}|(?:(?!#{scripts})[[:word:]])+/
  end

end