otwarchive-symphonyarchive/app/models/search/indexer.rb

229 lines
4.9 KiB
Ruby
Raw Permalink Normal View History

2026-03-11 22:22:11 +00:00
class Indexer
BATCH_SIZE = 1000
INDEXERS_FOR_CLASS = {
Work: %w[WorkIndexer WorkCreatorIndexer BookmarkedWorkIndexer],
Bookmark: %w[BookmarkIndexer],
Tag: %w[TagIndexer],
Pseud: %w[PseudIndexer],
Series: %w[BookmarkedSeriesIndexer],
ExternalWork: %w[BookmarkedExternalWorkIndexer],
User: %w[UserIndexer]
}.freeze
delegate :klass, :klass_with_includes, :index_name, :document_type, to: :class
##################
# CLASS METHODS
##################
def self.klass
raise "Must be defined in subclass"
end
def self.klass_with_includes
klass.constantize
end
def self.all
[
BookmarkedExternalWorkIndexer,
BookmarkedSeriesIndexer,
BookmarkedWorkIndexer,
BookmarkIndexer,
PseudIndexer,
TagIndexer,
UserIndexer,
WorkIndexer,
WorkCreatorIndexer
]
end
# Originally added to allow IndexSweeper to find the Elasticsearch document
# ids when they do not match the associated ActiveRecord objects' ids.
#
# Override in subclasses if necessary.
def self.find_elasticsearch_ids(ids)
ids
end
def self.delete_index
if $elasticsearch.indices.exists(index: index_name)
$elasticsearch.indices.delete(index: index_name)
end
end
def self.create_index(shards: 5)
$elasticsearch.indices.create(
index: index_name,
body: {
settings: {
index: {
# static settings
number_of_shards: shards,
# dynamic settings
max_result_window: ArchiveConfig.MAX_SEARCH_RESULTS,
}
}.merge(settings),
mappings: mapping,
}
)
end
def self.prepare_for_testing
raise "Wrong environment for test prep!" unless Rails.env.test?
delete_index
# Relevance sorting is unpredictable with multiple shards
# and small amounts of data
create_index(shards: 1)
end
def self.refresh_index
# Refreshes are resource-intensive, we use them only in tests.
raise "Wrong environment for index refreshes!" unless Rails.env.test?
return unless $elasticsearch.indices.exists(index: index_name)
$elasticsearch.indices.refresh(index: index_name)
end
# Note that the index must exist before you can set the mapping
def self.create_mapping
$elasticsearch.indices.put_mapping(
index: index_name,
body: mapping
)
end
def self.mapping
{
properties: {
# add properties in subclasses
}
}
end
def self.settings
{
analyzer: {
custom_analyzer: {
# add properties in subclasses
}
}
}
end
def self.index_all(options = {})
unless options[:skip_delete]
delete_index
create_index
end
index_from_db
end
def self.index_from_db
total = (indexables.count / BATCH_SIZE) + 1
i = 1
indexables.find_in_batches(batch_size: BATCH_SIZE) do |group|
puts "Queueing #{klass} batch #{i} of #{total}" unless Rails.env.test?
AsyncIndexer.new(self, :world).enqueue_ids(group.map(&:id))
i += 1
end
end
# Add conditions here
def self.indexables
klass.constantize
end
def self.index_name
"#{ArchiveConfig.ELASTICSEARCH_PREFIX}_#{Rails.env}_#{klass.underscore.pluralize}"
end
def self.document_type
klass.underscore
end
# Given a searchable object, what indexers should handle it?
# Returns an array of indexers
def self.for_object(object)
name = object.is_a?(Tag) ? 'Tag' : object.class.to_s
(INDEXERS_FOR_CLASS[name.to_sym] || []).map(&:constantize)
end
# Should be called after a batch update, with the IDs that were successfully
# updated. Calls successful_reindex on the indexable class.
def self.handle_success(ids)
if indexables.respond_to?(:successful_reindex)
indexables.successful_reindex(ids)
end
end
####################
# INSTANCE METHODS
####################
attr_reader :ids
def initialize(ids)
@ids = ids
end
def objects
@objects ||= klass_with_includes.where(id: ids).inject({}) do |h, obj|
h.merge(obj.id => obj)
end
end
def batch
return @batch if @batch
@batch = []
ids.each do |id|
object = objects[id.to_i]
if object.present?
@batch << { index: routing_info(id) }
@batch << document(object)
else
@batch << { delete: routing_info(id) }
end
end
@batch
end
def index_documents
return if batch.empty?
$elasticsearch.bulk(body: batch)
end
def index_document(object)
info = {
index: index_name,
id: document_id(object.id),
body: document(object)
}
if respond_to?(:parent_id)
info.merge!(routing: parent_id(object.id, object))
end
$elasticsearch.index(info)
end
def routing_info(id)
{
"_index" => index_name,
"_id" => id
}
end
def document(object)
object.as_json(root: false)
end
# can be overriden by our bookmarkable indexers
def document_id(id)
id
end
end