37 lines
1.2 KiB
Ruby
37 lines
1.2 KiB
Ruby
# encoding=utf-8
|
||
|
||
require 'nokogiri'
|
||
|
||
# Counts the words in a piece of HTML text.
class WordCounter
  # The raw (HTML) text whose words are counted.
  attr_accessor :text

  # text - the HTML source to count words in; may be nil or empty.
  def initialize(text)
    @text = text
  end

  # Returns the number of words in @text, or 0 when @text is nil or empty.
  #
  # Only actual text is counted: @text is parsed as HTML5 and only the text
  # nodes found while traversing <body> contribute to the total.
  #
  # Words are found by scanning for runs of word characters after stripping
  # hyphens and apostrophes, so one-word and one's are each counted as one
  # word, not two. "--" is replaced by — (em dash) before the strip, so
  # one--two counts as 2.
  def count
    # avoid blank? so we don't need to load Rails for tests
    return 0 if @text.nil? || @text.empty?

    # Build the pattern once per call: a regexp literal with interpolation is
    # rebuilt every time it is evaluated, which used to happen per text node.
    pattern = word_pattern
    total = 0

    body = Nokogiri::HTML5.parse(@text).xpath("//body").first
    body.traverse do |node|
      next unless node.text?

      cleaned = node.inner_text.gsub(/--/, "—").gsub(/['’‘-]/, "")
      total += cleaned.scan(pattern).size
    end

    total
  end

  private

  # Builds the regexp used by #count to pick out individual words.
  #
  # Scripts such as Chinese and Japanese that do not have spaces between words
  # are counted based on the number of characters: each character belonging to
  # one of the scripts named in ArchiveConfig.CHARACTER_COUNT_SCRIPTS is a
  # match on its own, while any other run of word characters is one match.
  # In mixed-language text only the characters in those scripts are counted
  # individually; words in other languages are counted as usual.
  def word_pattern
    scripts = ArchiveConfig.CHARACTER_COUNT_SCRIPTS.map { |lang| "\\p{#{lang}}" }

    # With no scripts configured the alternation below would have an empty
    # branch and match the empty string at every position, so fall back to
    # plain word matching.
    return /[[:word:]]+/ if scripts.empty?

    script_char = scripts.join("|")
    # Non-capturing group: only the number of matches is used, so there is no
    # need for scan to build an array of captures per match.
    /#{script_char}|(?:(?!#{script_char})[[:word:]])+/
  end
end
|