Skip to content

FEATURE: Translate categories and tags #269

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 37 additions & 18 deletions app/jobs/scheduled/automatic_translation_backfill.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ def fetch_untranslated_model_ids(model, content_column, limit, target_locale)
SELECT m.id
FROM #{model.table_name} m
#{limit_to_public_clause(model)}
WHERE m.deleted_at IS NULL
AND m.#{content_column} != ''
AND m.user_id > 0
#{max_age_clause}
WHERE m.#{content_column} != ''
#{not_deleted_clause(model)}
Copy link
Contributor Author

@nattsw nattsw Apr 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unlike posts and topics, categories and tags do not have a deleted_at column, nor can they be created by a bot.

#{non_bot_clause(model)}
#{max_age_clause(model)}
Comment on lines +28 to +30
Copy link
Contributor

@tgxworld tgxworld Apr 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering if it makes sense to just make this backfill_clause(model) or something instead of having the logic of model == Post || model == Topic present over 3 methods. This also makes the complete query much easier to read/understand IMO instead of having 3 parts to combine mentally.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually the current query looks very neat instead of having to repeat the clause for every model

            SELECT m.id
            FROM #{model.table_name} m
            #{limit_to_public_clause(model)}
            WHERE m.#{content_column} != ''
              #{not_deleted_clause(model)}
              #{non_bot_clause(model)}
              #{max_age_clause(model)}
            ORDER BY m.updated_at DESC

I would prefer to optimize this later if it gets more complex.

ORDER BY m.updated_at DESC
)
EXCEPT
Expand Down Expand Up @@ -87,26 +87,35 @@ def translate_records(type, record_ids, target_locale)

def process_batch
records_to_translate = SiteSetting.automatic_translation_backfill_rate
backfill_locales.each_with_index do |target_locale, i|
topic_ids =
fetch_untranslated_model_ids(Topic, "title", records_to_translate, target_locale)
post_ids = fetch_untranslated_model_ids(Post, "raw", records_to_translate, target_locale)

next if topic_ids.empty? && post_ids.empty?

DiscourseTranslator::VerboseLogger.log(
"Translating #{topic_ids.size} topics and #{post_ids.size} posts to #{target_locale}",
)
backfill_locales.each do |target_locale|
[
[Topic, "title"],
[Post, "raw"],
[Category, "name"],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is Category#description too. Is that something we want to consider in this PR too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not yet. Category descriptions are currently topics so we will ignore for now.

[Tag, "name"],
].each do |model, content_column|
ids =
fetch_untranslated_model_ids(model, content_column, records_to_translate, target_locale)

next if ids.empty?

DiscourseTranslator::VerboseLogger.log(
"Translating #{ids.size} #{model.name} to #{target_locale}",
)

translate_records(Topic, topic_ids, target_locale)
translate_records(Post, post_ids, target_locale)
translate_records(model, ids, target_locale)
end
end
end

def max_age_clause
def max_age_clause(model)
return "" if SiteSetting.automatic_translation_backfill_max_age_days <= 0

"AND m.created_at > NOW() - INTERVAL '#{SiteSetting.automatic_translation_backfill_max_age_days} days'"
if model == Post || model == Topic
"AND m.created_at > NOW() - INTERVAL '#{SiteSetting.automatic_translation_backfill_max_age_days} days'"
else
""
end
end

def limit_to_public_clause(model)
Expand All @@ -130,5 +139,15 @@ def limit_to_public_clause(model)

limit_to_public_clause
end

# Builds the optional SQL condition that keeps only rows with a positive
# user_id. It is interpolated into the WHERE clause of the backfill query,
# whose table alias is `m`.
#
# @param model [Class] the model class being backfilled
# @return [String] "AND m.user_id > 0" for Post and Topic, "" for any other model
def non_bot_clause(model)
  if model == Post || model == Topic
    "AND m.user_id > 0"
  else
    ""
  end
end

# Builds the optional SQL condition that skips rows with a non-NULL
# deleted_at. It is interpolated into the WHERE clause of the backfill query,
# whose table alias is `m`.
#
# @param model [Class] the model class being backfilled
# @return [String] "AND m.deleted_at IS NULL" for Post and Topic, "" for any other model
def not_deleted_clause(model)
  if model == Post || model == Topic
    "AND m.deleted_at IS NULL"
  else
    ""
  end
end
end
end
23 changes: 23 additions & 0 deletions app/models/discourse_translator/category_locale.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# frozen_string_literal: true

module DiscourseTranslator
  # ActiveRecord model backed by the discourse_translator_category_locales
  # table. Each row belongs to a category and carries the locale detected for
  # it; both the category reference and the detected locale are required.
  class CategoryLocale < ActiveRecord::Base
    self.table_name = "discourse_translator_category_locales"

    belongs_to :category

    validates :category_id, :detected_locale, presence: true
  end
end

# == Schema Information
#
# Table name: discourse_translator_category_locales
#
# id :bigint not null, primary key
# category_id :integer not null
# detected_locale :string(20) not null
# created_at :datetime not null
# updated_at :datetime not null
#
30 changes: 30 additions & 0 deletions app/models/discourse_translator/category_translation.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# frozen_string_literal: true

module DiscourseTranslator
class CategoryTranslation < ActiveRecord::Base
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to differentiate between Category#name and Category#description? It might be useful to think about how we can expand this table to store translations for more than a single column, since some models may have multiple columns that need to be translated.

Copy link
Contributor Author

@nattsw nattsw Apr 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No.

This current implementation makes use of the translatable concern. Category descriptions point to a topic so it will use topic translations instead.

self.table_name = "discourse_translator_category_translations"

belongs_to :category

validates :category_id, presence: true
validates :locale, presence: true
validates :translation, presence: true
validates :locale, uniqueness: { scope: :category_id }
end
end

# == Schema Information
#
# Table name: discourse_translator_category_translations
#
# id :bigint not null, primary key
# category_id :integer not null
# locale :string not null
# translation :text not null
# created_at :datetime not null
# updated_at :datetime not null
#
# Indexes
#
# idx_category_translations_on_category_id_and_locale (category_id,locale) UNIQUE
#
23 changes: 23 additions & 0 deletions app/models/discourse_translator/tag_locale.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# frozen_string_literal: true

module DiscourseTranslator
  # ActiveRecord model backed by the discourse_translator_tag_locales table.
  # Each row belongs to a tag and carries the locale detected for it; both the
  # tag reference and the detected locale are required.
  class TagLocale < ActiveRecord::Base
    self.table_name = "discourse_translator_tag_locales"

    belongs_to :tag

    validates :tag_id, :detected_locale, presence: true
  end
end

# == Schema Information
#
# Table name: discourse_translator_tag_locales
#
# id :bigint not null, primary key
# tag_id :integer not null
# detected_locale :string(20) not null
# created_at :datetime not null
# updated_at :datetime not null
#
30 changes: 30 additions & 0 deletions app/models/discourse_translator/tag_translation.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# frozen_string_literal: true

module DiscourseTranslator
  # ActiveRecord model backed by the discourse_translator_tag_translations
  # table. Each row holds one translation of a tag for one locale.
  class TagTranslation < ActiveRecord::Base
    self.table_name = "discourse_translator_tag_translations"

    belongs_to :tag

    # All three columns are declared NOT NULL in the table definition.
    validates :tag_id, :locale, :translation, presence: true
    # At most one translation per (tag, locale) pair.
    validates :locale, uniqueness: { scope: :tag_id }
  end
end

# == Schema Information
#
# Table name: discourse_translator_tag_translations
#
# id :bigint not null, primary key
# tag_id :integer not null
# locale :string not null
# translation :text not null
# created_at :datetime not null
# updated_at :datetime not null
#
# Indexes
#
# idx_tag_translations_on_tag_id_and_locale (tag_id,locale) UNIQUE
#
37 changes: 37 additions & 0 deletions app/services/discourse_ai/category_translator.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# frozen_string_literal: true

module DiscourseAi
  # Translator specialised for forum category names. It only supplies the
  # prompt text; everything else is inherited from BaseTranslator.
  class CategoryTranslator < BaseTranslator
    # Instructions sent to the model: translate a category name into the
    # requested target_language and answer as JSON wrapped in <output> tags.
    PROMPT_TEMPLATE = <<~TEXT.freeze
      You are a translation service specializing in translating forum category names to the asked target_language. Your task is to provide accurate and contextually appropriate translations while adhering to the following guidelines:

      1. Translate the category name to target_language asked
      2. Keep proper nouns and technical terms in their original language
      3. Keep the translated category name length short, and close to the original length
      4. Ensure the translation maintains the original meaning

      Provide your translation in the following JSON format:

      <output>
      {"translation": "Your target_language translation here"}
      </output>

      Here are three examples of correct translation

      Original: {"name":"Cats and Dogs", "target_language":"Chinese"}
      Correct translation: {"translation": "猫和狗"}

      Original: {"name":"General", "target_language":"French"}
      Correct translation: {"translation": "Général"}

      Original: {"name": "Q&A", "target_language": "Portuguese"}
      Correct translation: {"translation": "Perguntas e Respostas"}

      Remember to keep proper nouns like "Minecraft" and "Toyota" in their original form. Translate the category name now and provide your answer in the specified JSON format.
    TEXT

    private

    # Returns the category-specific prompt.
    # NOTE(review): presumably a hook invoked by BaseTranslator — confirm there.
    def prompt_template
      PROMPT_TEMPLATE
    end
  end
end
38 changes: 38 additions & 0 deletions app/services/discourse_ai/tag_translator.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# frozen_string_literal: true

module DiscourseAi
  # Translator specialised for forum tags. It only supplies the prompt text;
  # everything else is inherited from BaseTranslator.
  class TagTranslator < BaseTranslator
    # Instructions sent to the model: translate a tag into the requested
    # target_language, in lowercase, and answer as JSON wrapped in <output>
    # tags. Guidelines are numbered 1-5 so each rule has a distinct number.
    PROMPT_TEMPLATE = <<~TEXT.freeze
      You are a translation service specializing in translating forum tags to the asked target_language. Your task is to provide accurate and contextually appropriate translations while adhering to the following guidelines:

      1. Translate the tags to target_language asked
      2. Keep proper nouns and technical terms in their original language
      3. Keep the translated tags short, close to the original length
      4. Ensure the translation maintains the original meaning
      5. Translated tags will be in lowercase

      Provide your translation in the following JSON format:

      <output>
      {"translation": "your target_language translation here"}
      </output>

      Here are three examples of correct translation

      Original: {"name":"solved", "target_language":"Chinese"}
      Correct translation: {"translation": "已解决"}

      Original: {"name":"General", "target_language":"French"}
      Correct translation: {"translation": "général"}

      Original: {"name": "Q&A", "target_language": "Portuguese"}
      Correct translation: {"translation": "perguntas e respostas"}

      Remember to keep proper nouns like "minecraft" and "toyota" in their original form. Translate the tag now and provide your answer in the specified JSON format.
    TEXT

    # Returns the tag-specific prompt.
    # NOTE(review): presumably a hook invoked by BaseTranslator — confirm there.
    private def prompt_template
      PROMPT_TEMPLATE
    end
  end
end
4 changes: 2 additions & 2 deletions app/services/discourse_ai/topic_translator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
module DiscourseAi
class TopicTranslator < BaseTranslator
PROMPT_TEMPLATE = <<~TEXT.freeze
You are a translation service specializing in translating forum post titles from English to the asked target_language. Your task is to provide accurate and contextually appropriate translations while adhering to the following guidelines:
You are a translation service specializing in translating forum post titles to the asked target_language. Your task is to provide accurate and contextually appropriate translations while adhering to the following guidelines:

1. Translate the given title from English to target_language asked.
1. Translate the given title to target_language asked.
2. Keep proper nouns and technical terms in their original language.
3. Attempt to keep the translated title length close to the original when possible.
4. Ensure the translation maintains the original meaning and tone.
Expand Down
4 changes: 4 additions & 0 deletions app/services/discourse_translator/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,10 @@ def self.get_untranslated(translatable, raw: false)
raw ? translatable.raw : translatable.cooked
when "Topic"
translatable.title
when "Category"
translatable.name
when "Tag"
translatable.name
end
end
end
Expand Down
7 changes: 7 additions & 0 deletions app/services/discourse_translator/discourse_ai.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ def self.translate!(translatable, target_locale_sym = I18n.locale)
.join("")
when "Topic"
::DiscourseAi::TopicTranslator.new(text_for_translation(translatable), language).translate
when "Category"
::DiscourseAi::CategoryTranslator.new(
text_for_translation(translatable),
language,
).translate
when "Tag"
::DiscourseAi::TagTranslator.new(text_for_translation(translatable), language).translate
end

DiscourseTranslator::TranslatedContentNormalizer.normalize(translatable, translated)
Expand Down
23 changes: 23 additions & 0 deletions db/migrate/20250401015139_create_category_translation_table.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# frozen_string_literal: true

# Creates the two tables that hold translation data for categories:
# one for the detected locale and one for the translations themselves.
class CreateCategoryTranslationTable < ActiveRecord::Migration[7.2]
  def change
    # Detected locale of each category.
    create_table :discourse_translator_category_locales do |t|
      t.integer :category_id, null: false
      t.string :detected_locale, limit: 20, null: false
      t.timestamps
    end

    # Translations of a category, at most one per (category, locale) pair.
    create_table :discourse_translator_category_translations do |t|
      t.integer :category_id, null: false
      t.string :locale, null: false
      t.text :translation, null: false
      t.timestamps

      t.index %i[category_id locale],
              unique: true,
              name: "idx_category_translations_on_category_id_and_locale"
    end
  end
end
23 changes: 23 additions & 0 deletions db/migrate/20250401022618_create_tag_translation_table.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# frozen_string_literal: true

# Creates the two tables that hold translation data for tags:
# one for the detected locale and one for the translations themselves.
class CreateTagTranslationTable < ActiveRecord::Migration[7.2]
  def change
    # Detected locale of each tag.
    create_table :discourse_translator_tag_locales do |t|
      t.integer :tag_id, null: false
      t.string :detected_locale, limit: 20, null: false
      t.timestamps
    end

    # Translations of a tag, at most one per (tag, locale) pair.
    create_table :discourse_translator_tag_translations do |t|
      t.integer :tag_id, null: false
      t.string :locale, null: false
      t.text :translation, null: false
      t.timestamps

      t.index %i[tag_id locale],
              unique: true,
              name: "idx_tag_translations_on_tag_id_and_locale"
    end
  end
end
11 changes: 11 additions & 0 deletions lib/discourse_translator/extensions/category_extension.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# frozen_string_literal: true

module DiscourseTranslator
  module Extensions
    # Concern intended to be prepended onto the category model. It mixes in
    # Translatable and, once prepended, registers a before_update callback
    # that runs clear_translations whenever the name attribute has changed.
    # NOTE(review): clear_translations is presumably supplied by Translatable — confirm.
    module CategoryExtension
      extend ActiveSupport::Concern

      prepended do
        before_update :clear_translations, if: :name_changed?
      end

      include Translatable
    end
  end
end
11 changes: 11 additions & 0 deletions lib/discourse_translator/extensions/tag_extension.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# frozen_string_literal: true

module DiscourseTranslator
  module Extensions
    # Concern intended to be prepended onto the tag model. It mixes in
    # Translatable and, once prepended, registers a before_update callback
    # that runs clear_translations whenever the name attribute has changed.
    # NOTE(review): clear_translations is presumably supplied by Translatable — confirm.
    module TagExtension
      extend ActiveSupport::Concern

      prepended do
        before_update :clear_translations, if: :name_changed?
      end

      include Translatable
    end
  end
end
Loading
Loading