otwarchive-symphonyarchive/app/models/search/indexer.rb

# Base class for the Elasticsearch indexers.
#
# A subclass must define +self.klass+ (the name of the model it indexes, as a
# String) and normally overrides +self.mapping+, +self.settings+ and
# +#document+. The class methods manage the index itself (create, delete,
# queue a full reindex); an instance wraps a list of record ids and turns
# them into a bulk request.
class Indexer
  # Number of records queued per batch by index_from_db.
  BATCH_SIZE = 1000

  # Model class name => names of the indexer classes that handle that model.
  # Looked up by for_object.
  INDEXERS_FOR_CLASS = {
    Work: %w[WorkIndexer WorkCreatorIndexer BookmarkedWorkIndexer],
    Bookmark: %w[BookmarkIndexer],
    Tag: %w[TagIndexer],
    Pseud: %w[PseudIndexer],
    Series: %w[BookmarkedSeriesIndexer],
    ExternalWork: %w[BookmarkedExternalWorkIndexer],
    User: %w[UserIndexer]
  }.freeze

  # Instances answer these by forwarding to their class.
  delegate :klass, :klass_with_includes, :index_name, :document_type, to: :class

  ##################
  # CLASS METHODS
  ##################

  # Name of the indexed model, as a String. Subclasses must override this;
  # the base implementation always raises.
  def self.klass
    raise "Must be defined in subclass"
  end

  # The model class used by #objects to load records. The base version is the
  # plain model class resolved from klass.
  def self.klass_with_includes
    klass.constantize
  end

  # Every concrete indexer class.
  def self.all
    [
      BookmarkedExternalWorkIndexer,
      BookmarkedSeriesIndexer,
      BookmarkedWorkIndexer,
      BookmarkIndexer,
      PseudIndexer,
      TagIndexer,
      UserIndexer,
      WorkIndexer,
      WorkCreatorIndexer
    ]
  end

  # Originally added to allow IndexSweeper to find the Elasticsearch document
  # ids when they do not match the associated ActiveRecord objects' ids.
  #
  # The base implementation returns the ids unchanged.
  # Override in subclasses if necessary.
  def self.find_elasticsearch_ids(ids)
    ids
  end

  # Deletes this indexer's index if it exists; does nothing otherwise.
  def self.delete_index
    return unless $elasticsearch.indices.exists(index: index_name)

    $elasticsearch.indices.delete(index: index_name)
  end

  # Creates this indexer's index with the given number of shards, the
  # configured maximum result window, and the class's settings and mapping.
  #
  # Note that +settings+ is merged shallowly over the defaults: a top-level
  # +index+ key returned by +settings+ replaces the whole default +index+
  # hash rather than being combined with it.
  def self.create_index(shards: 5)
    $elasticsearch.indices.create(
      index: index_name,
      body: {
        settings: {
          index: {
            # static settings
            number_of_shards: shards,
            # dynamic settings
            max_result_window: ArchiveConfig.MAX_SEARCH_RESULTS
          }
        }.merge(settings),
        mappings: mapping
      }
    )
  end

  # Drops and recreates the index with a single shard. Raises unless running
  # in the test environment.
  def self.prepare_for_testing
    raise "Wrong environment for test prep!" unless Rails.env.test?

    delete_index
    # Relevance sorting is unpredictable with multiple shards
    # and small amounts of data
    create_index(shards: 1)
  end

  # Refreshes the index if it exists. Raises unless running in the test
  # environment.
  def self.refresh_index
    # Refreshes are resource-intensive, we use them only in tests.
    raise "Wrong environment for index refreshes!" unless Rails.env.test?

    return unless $elasticsearch.indices.exists(index: index_name)

    $elasticsearch.indices.refresh(index: index_name)
  end

  # Sends the class's mapping to the index.
  # Note that the index must exist before you can set the mapping
  def self.create_mapping
    $elasticsearch.indices.put_mapping(
      index: index_name,
      body: mapping
    )
  end

  # Field mapping for the index. The base version has no properties.
  def self.mapping
    {
      properties: {
        # add properties in subclasses
      }
    }
  end

  # Extra settings merged into the body sent by create_index.
  def self.settings
    {
      analyzer: {
        custom_analyzer: {
          # add properties in subclasses
        }
      }
    }
  end

  # Queues every indexable record for reindexing. Unless
  # options[:skip_delete] is truthy, the index is deleted and recreated first.
  def self.index_all(options = {})
    unless options[:skip_delete]
      delete_index
      create_index
    end
    index_from_db
  end

  # Walks indexables in batches of BATCH_SIZE and enqueues each batch's ids
  # through AsyncIndexer, printing progress outside the test environment.
  def self.index_from_db
    # Ceiling division, so an exact multiple of BATCH_SIZE (or an empty
    # table) does not report one batch more than will actually be queued.
    total = indexables.count.fdiv(BATCH_SIZE).ceil
    batch_number = 1
    indexables.find_in_batches(batch_size: BATCH_SIZE) do |group|
      puts "Queueing #{klass} batch #{batch_number} of #{total}" unless Rails.env.test?
      AsyncIndexer.new(self, :world).enqueue_ids(group.map(&:id))
      batch_number += 1
    end
  end

  # The records that index_from_db should index; the base version is the
  # whole model class.
  # Add conditions here
  def self.indexables
    klass.constantize
  end

  # Index name built from the configured prefix, the Rails environment and
  # the pluralized, underscored model name.
  def self.index_name
    "#{ArchiveConfig.ELASTICSEARCH_PREFIX}_#{Rails.env}_#{klass.underscore.pluralize}"
  end

  # The underscored model name.
  def self.document_type
    klass.underscore
  end

  # Given a searchable object, what indexers should handle it?
  # Returns an array of indexers (empty when the object's class has no entry
  # in INDEXERS_FOR_CLASS). Any Tag instance, whatever its concrete class, is
  # looked up under "Tag".
  def self.for_object(object)
    name = object.is_a?(Tag) ? 'Tag' : object.class.to_s
    INDEXERS_FOR_CLASS.fetch(name.to_sym, []).map(&:constantize)
  end

  # Should be called after a batch update, with the IDs that were successfully
  # updated. Calls successful_reindex on the indexable class, if it defines it.
  def self.handle_success(ids)
    return unless indexables.respond_to?(:successful_reindex)

    indexables.successful_reindex(ids)
  end

  ####################
  # INSTANCE METHODS
  ####################

  attr_reader :ids

  # ids: the record ids this indexer instance will process. They are
  # converted with to_i when looked up in #objects, so numeric strings work.
  def initialize(ids)
    @ids = ids
  end

  # Hash of record id => record for every id that still exists in the
  # database. Memoized.
  def objects
    # Fill a single hash in place instead of merging a new one per record.
    @objects ||= klass_with_includes.where(id: ids).each_with_object({}) do |obj, by_id|
      by_id[obj.id] = obj
    end
  end

  # The body for a bulk request: for each id, either an index action followed
  # by the document, or a delete action when the record was not found.
  # Memoized only once fully built, so an exception raised while building
  # does not leave a truncated batch behind for later calls.
  def batch
    @batch ||= ids.each_with_object([]) do |id, actions|
      object = objects[id.to_i]
      if object.present?
        actions << { index: routing_info(id) }
        actions << document(object)
      else
        actions << { delete: routing_info(id) }
      end
    end
  end

  # Sends the batch to Elasticsearch in one bulk request. Returns nil without
  # contacting Elasticsearch when there is nothing to send.
  def index_documents
    return if batch.empty?

    $elasticsearch.bulk(body: batch)
  end

  # Indexes a single object immediately. If the indexer defines parent_id,
  # its result is passed as the routing value.
  def index_document(object)
    info = {
      index: index_name,
      id: document_id(object.id),
      body: document(object)
    }
    info[:routing] = parent_id(object.id, object) if respond_to?(:parent_id)
    $elasticsearch.index(info)
  end

  # Bulk action metadata for one id. The base version uses the id as given
  # (it does not go through document_id).
  def routing_info(id)
    {
      "_index" => index_name,
      "_id" => id
    }
  end

  # The document stored for an object: its JSON representation without a
  # root key.
  def document(object)
    object.as_json(root: false)
  end

  # Elasticsearch document id for a record id; the base version returns the
  # id unchanged.
  # can be overridden by our bookmarkable indexers
  def document_id(id)
    id
  end

end
first 2026-03-11 22:22:11 +00:00			`class Indexer`
			`BATCH_SIZE = 1000`
			`INDEXERS_FOR_CLASS = {`
			`Work: %w[WorkIndexer WorkCreatorIndexer BookmarkedWorkIndexer],`
			`Bookmark: %w[BookmarkIndexer],`
			`Tag: %w[TagIndexer],`
			`Pseud: %w[PseudIndexer],`
			`Series: %w[BookmarkedSeriesIndexer],`
			`ExternalWork: %w[BookmarkedExternalWorkIndexer],`
			`User: %w[UserIndexer]`
			`}.freeze`

			`delegate :klass, :klass_with_includes, :index_name, :document_type, to: :class`

			`##################`
			`# CLASS METHODS`
			`##################`

			`def self.klass`
			`raise "Must be defined in subclass"`
			`end`

			`def self.klass_with_includes`
			`klass.constantize`
			`end`

			`def self.all`
			`[`
			`BookmarkedExternalWorkIndexer,`
			`BookmarkedSeriesIndexer,`
			`BookmarkedWorkIndexer,`
			`BookmarkIndexer,`
			`PseudIndexer,`
			`TagIndexer,`
			`UserIndexer,`
			`WorkIndexer,`
			`WorkCreatorIndexer`
			`]`
			`end`

			`# Originally added to allow IndexSweeper to find the Elasticsearch document`
			`# ids when they do not match the associated ActiveRecord objects' ids.`
			`#`
			`# Override in subclasses if necessary.`
			`def self.find_elasticsearch_ids(ids)`
			`ids`
			`end`

			`def self.delete_index`
			`if $elasticsearch.indices.exists(index: index_name)`
			`$elasticsearch.indices.delete(index: index_name)`
			`end`
			`end`

			`def self.create_index(shards: 5)`
			`$elasticsearch.indices.create(`
			`index: index_name,`
			`body: {`
			`settings: {`
			`index: {`
			`# static settings`
			`number_of_shards: shards,`
			`# dynamic settings`
			`max_result_window: ArchiveConfig.MAX_SEARCH_RESULTS,`
			`}`
			`}.merge(settings),`
			`mappings: mapping,`
			`}`
			`)`
			`end`

			`def self.prepare_for_testing`
			`raise "Wrong environment for test prep!" unless Rails.env.test?`

			`delete_index`
			`# Relevance sorting is unpredictable with multiple shards`
			`# and small amounts of data`
			`create_index(shards: 1)`
			`end`

			`def self.refresh_index`
			`# Refreshes are resource-intensive, we use them only in tests.`
			`raise "Wrong environment for index refreshes!" unless Rails.env.test?`

			`return unless $elasticsearch.indices.exists(index: index_name)`

			`$elasticsearch.indices.refresh(index: index_name)`
			`end`

			`# Note that the index must exist before you can set the mapping`
			`def self.create_mapping`
			`$elasticsearch.indices.put_mapping(`
			`index: index_name,`
			`body: mapping`
			`)`
			`end`

			`def self.mapping`
			`{`
			`properties: {`
			`# add properties in subclasses`
			`}`
			`}`
			`end`

			`def self.settings`
			`{`
			`analyzer: {`
			`custom_analyzer: {`
			`# add properties in subclasses`
			`}`
			`}`
			`}`
			`end`

			`def self.index_all(options = {})`
			`unless options[:skip_delete]`
			`delete_index`
			`create_index`
			`end`
			`index_from_db`
			`end`

			`def self.index_from_db`
			`total = (indexables.count / BATCH_SIZE) + 1`
			`i = 1`
			`indexables.find_in_batches(batch_size: BATCH_SIZE) do \|group\|`
			`puts "Queueing #{klass} batch #{i} of #{total}" unless Rails.env.test?`
			`AsyncIndexer.new(self, :world).enqueue_ids(group.map(&:id))`
			`i += 1`
			`end`
			`end`

			`# Add conditions here`
			`def self.indexables`
			`klass.constantize`
			`end`

			`def self.index_name`
			`"#{ArchiveConfig.ELASTICSEARCH_PREFIX}_#{Rails.env}_#{klass.underscore.pluralize}"`
			`end`

			`def self.document_type`
			`klass.underscore`
			`end`

			`# Given a searchable object, what indexers should handle it?`
			`# Returns an array of indexers`
			`def self.for_object(object)`
			`name = object.is_a?(Tag) ? 'Tag' : object.class.to_s`
			`(INDEXERS_FOR_CLASS[name.to_sym] \|\| []).map(&:constantize)`
			`end`

			`# Should be called after a batch update, with the IDs that were successfully`
			`# updated. Calls successful_reindex on the indexable class.`
			`def self.handle_success(ids)`
			`if indexables.respond_to?(:successful_reindex)`
			`indexables.successful_reindex(ids)`
			`end`
			`end`

			`####################`
			`# INSTANCE METHODS`
			`####################`

			`attr_reader :ids`

			`def initialize(ids)`
			`@ids = ids`
			`end`

			`def objects`
			`@objects \|\|= klass_with_includes.where(id: ids).inject({}) do \|h, obj\|`
			`h.merge(obj.id => obj)`
			`end`
			`end`

			`def batch`
			`return @batch if @batch`

			`@batch = []`
			`ids.each do \|id\|`
			`object = objects[id.to_i]`
			`if object.present?`
			`@batch << { index: routing_info(id) }`
			`@batch << document(object)`
			`else`
			`@batch << { delete: routing_info(id) }`
			`end`
			`end`
			`@batch`
			`end`

			`def index_documents`
			`return if batch.empty?`

			`$elasticsearch.bulk(body: batch)`
			`end`

			`def index_document(object)`
			`info = {`
			`index: index_name,`
			`id: document_id(object.id),`
			`body: document(object)`
			`}`
			`if respond_to?(:parent_id)`
			`info.merge!(routing: parent_id(object.id, object))`
			`end`
			`$elasticsearch.index(info)`
			`end`

			`def routing_info(id)`
			`{`
			`"_index" => index_name,`
			`"_id" => id`
			`}`
			`end`

			`def document(object)`
			`object.as_json(root: false)`
			`end`

			`# can be overriden by our bookmarkable indexers`
			`def document_id(id)`
			`id`
			`end`

			`end`