Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit fae2d5f

Browse files
authored
FEATURE: link correctly to filters to assist in debugging spam (#1031)
- Add spam_score_type to AiSpamSerializer for better integration with reviewables.
- Introduce a custom filter for detecting AI spam false negatives in moderation workflows.
- Refactor spam report generation to improve identification of false negatives.
- Add tests to verify the custom filter and its behavior.
- Introduce links for all spam counts in the report.
1 parent 90ce942 commit fae2d5f

File tree

5 files changed

+118
-28
lines changed

5 files changed

+118
-28
lines changed

app/serializers/ai_spam_serializer.rb

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,13 @@
11
# frozen_string_literal: true
22

33
class AiSpamSerializer < ApplicationSerializer
4-
attributes :is_enabled, :llm_id, :custom_instructions, :available_llms, :stats, :flagging_username
4+
attributes :is_enabled,
5+
:llm_id,
6+
:custom_instructions,
7+
:available_llms,
8+
:stats,
9+
:flagging_username,
10+
:spam_score_type
511

612
def is_enabled
713
object[:enabled]
@@ -25,6 +31,10 @@ def flagging_username
2531
object[:flagging_username]
2632
end
2733

34+
def spam_score_type
35+
ReviewableScore.types[:spam]
36+
end
37+
2838
def stats
2939
{
3040
scanned_count: object[:stats].scanned_count.to_i,

assets/javascripts/discourse/components/ai-spam.gjs

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,30 @@ export default class AiSpam extends Component {
125125
label: i18n("discourse_ai.spam.spam_detected"),
126126
value: this.stats.spam_detected,
127127
};
128+
129+
const falsePositives = {
130+
label: i18n("discourse_ai.spam.false_positives"),
131+
value: this.stats.false_positives,
132+
tooltip: i18n("discourse_ai.spam.stat_tooltips.incorrectly_flagged"),
133+
};
134+
135+
const falseNegatives = {
136+
label: i18n("discourse_ai.spam.false_negatives"),
137+
value: this.stats.false_negatives,
138+
tooltip: i18n("discourse_ai.spam.stat_tooltips.missed_spam"),
139+
};
140+
128141
if (this.args.model.flagging_username) {
129142
detected.href = getURL(
130-
"/review?flagged_by=" + this.args.model.flagging_username
143+
`/review?flagged_by=${this.args.model.flagging_username}&status=all&sort_order=created_at`
144+
);
145+
146+
falsePositives.href = getURL(
147+
`/review?flagged_by=${this.args.model.flagging_username}&status=rejected&sort_order=created_at`
148+
);
149+
150+
falseNegatives.href = getURL(
151+
`/review?status=approved&sort_order=created_at&additional_filters={"ai_spam_false_negative":true}&order=created&score_type=${this.args.model.spam_score_type}`
131152
);
132153
}
133154
return [
@@ -136,16 +157,8 @@ export default class AiSpam extends Component {
136157
value: this.stats.scanned_count,
137158
},
138159
detected,
139-
{
140-
label: i18n("discourse_ai.spam.false_positives"),
141-
value: this.stats.false_positives,
142-
tooltip: i18n("discourse_ai.spam.stat_tooltips.incorrectly_flagged"),
143-
},
144-
{
145-
label: i18n("discourse_ai.spam.false_negatives"),
146-
value: this.stats.false_negatives,
147-
tooltip: i18n("discourse_ai.spam.stat_tooltips.missed_spam"),
148-
},
160+
falsePositives,
161+
falseNegatives,
149162
];
150163
}
151164

lib/ai_moderation/entry_point.rb

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,25 @@ def inject_into(plugin)
1111
plugin.on(:site_setting_changed) do |name, _old_value, new_value|
1212
SpamScanner.ensure_flagging_user! if name == :ai_spam_detection_enabled && new_value
1313
end
14+
15+
custom_filter = [
16+
:ai_spam_false_negative,
17+
Proc.new do |results, value|
18+
if value
19+
results.where(<<~SQL)
20+
EXISTS (
21+
SELECT 1 FROM ai_spam_logs
22+
WHERE NOT is_spam
23+
AND post_id = target_id AND target_type = 'Post'
24+
)
25+
SQL
26+
else
27+
results
28+
end
29+
end,
30+
]
31+
32+
Reviewable.add_custom_filter(custom_filter)
1433
end
1534
end
1635
end

lib/ai_moderation/spam_report.rb

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,33 +14,34 @@ def self.generate(min_date: 1.week.ago)
1414
asl.post_id,
1515
asl.is_spam,
1616
r.status as reviewable_status,
17-
r.target_type,
18-
r.potential_spam
17+
CASE WHEN EXISTS (
18+
SELECT 1 FROM reviewable_scores rs
19+
JOIN reviewables r1 ON r1.id = rs.reviewable_id
20+
WHERE r1.target_id = asl.post_id
21+
AND r1.target_type = 'Post'
22+
AND rs.reviewable_score_type = :spam_score_type
23+
AND NOT is_spam
24+
AND r1.status IN (:spam)
25+
) THEN true ELSE false END AS missed_spam
1926
FROM ai_spam_logs asl
2027
LEFT JOIN reviewables r ON r.id = asl.reviewable_id
2128
WHERE asl.created_at > :min_date
22-
),
23-
post_reviewables AS (
24-
SELECT
25-
target_id post_id,
26-
COUNT(DISTINCT target_id) as false_negative_count
27-
FROM reviewables
28-
WHERE target_type = 'Post'
29-
AND status IN (:spam)
30-
AND potential_spam
31-
AND target_id IN (SELECT post_id FROM spam_stats)
32-
GROUP BY target_id
3329
)
3430
SELECT
3531
COUNT(*) AS scanned_count,
3632
SUM(CASE WHEN is_spam THEN 1 ELSE 0 END) AS spam_detected,
3733
COUNT(CASE WHEN reviewable_status IN (:ham) THEN 1 END) AS false_positives,
38-
COALESCE(SUM(pr.false_negative_count), 0) AS false_negatives
34+
COUNT(CASE WHEN missed_spam THEN 1 END) AS false_negatives
3935
FROM spam_stats
40-
LEFT JOIN post_reviewables pr USING (post_id)
4136
SQL
4237

43-
DB.query(sql, spam: spam_status, ham: ham_status, min_date: min_date).first
38+
DB.query(
39+
sql,
40+
spam: spam_status,
41+
ham: ham_status,
42+
min_date: min_date,
43+
spam_score_type: ReviewableScore.types[:spam],
44+
).first
4445
end
4546
end
4647
end
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# frozen_string_literal: true
2+
3+
RSpec.describe ReviewablesController do
4+
fab!(:post1) { Fabricate(:post) }
5+
fab!(:post2) { Fabricate(:post) }
6+
fab!(:admin)
7+
fab!(:llm_model)
8+
9+
fab!(:reviewable) do
10+
Reviewable.create!(
11+
target: post1,
12+
topic: post2.topic,
13+
type: ReviewablePost,
14+
created_by: admin,
15+
status: Reviewable.statuses[:pending],
16+
)
17+
end
18+
19+
fab!(:reviewable2) do
20+
Reviewable.create!(
21+
target: post2,
22+
topic: post2.topic,
23+
type: ReviewablePost,
24+
created_by: admin,
25+
status: Reviewable.statuses[:pending],
26+
)
27+
end
28+
29+
fab!(:ai_spam_log_missed) do
30+
AiSpamLog.create!(is_spam: false, post_id: post1.id, llm_model_id: llm_model.id)
31+
end
32+
# we amend the behavior with a custom filter so we need to confirm it works
33+
it "properly applies custom filter" do
34+
sign_in(admin)
35+
36+
get '/review.json?additional_filters={"ai_spam_false_negative":true}'
37+
expect(response.status).to eq(200)
38+
39+
json = JSON.parse(response.body)
40+
expect(json["reviewables"].length).to eq(1)
41+
42+
get "/review.json"
43+
expect(response.status).to eq(200)
44+
json = JSON.parse(response.body)
45+
expect(json["reviewables"].length).to eq(2)
46+
end
47+
end

0 commit comments

Comments
 (0)