Skip to content

Commit ebf610d

Browse files
committed
Initial commit
0 parents  commit ebf610d

9 files changed

+294
-0
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
config.json
2+
tmp/
3+
dump.db

Gemfile

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# frozen_string_literal: true

source "https://rubygems.org"

# sqlite3 backs the local dump.db staging database
gem "sqlite3"
# mini_sql provides the thin query/exec API used over the sqlite connection
gem "mini_sql"

Gemfile.lock

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
GEM
2+
remote: https://rubygems.org/
3+
specs:
4+
mini_sql (1.4.0)
5+
sqlite3 (1.6.2-x86_64-linux)
6+
7+
PLATFORMS
8+
x86_64-linux
9+
10+
DEPENDENCIES
11+
mini_sql
12+
sqlite3
13+
14+
BUNDLED WITH
15+
2.4.7

README.md

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
### Public Data Dump for your forum
2+
3+
This repo attempts to establish a pattern for a public data dump. It includes 2 data explorer queries you can use to export all your public data.
4+
5+
Public data is defined as forum topics and posts that anonymous users can access.
6+
7+
### How to use this?
8+
9+
First you need to define 2 queries using data explorer:
10+
11+
1. Topic query: [here](topic_query.sql)
12+
2. Post query: [here](post_query.sql)
13+
14+
Once the queries are defined, note the data explorer query ids shown in their URLs.
15+
16+
Next, define an API key with rights to run the 2 queries.
17+
18+
### config.json
19+
20+
Create a [config.json](config.json.sample) specifying the domain of your discourse site, api key and data explorer query ids.
21+
22+
### Importing the site into Sqlite
23+
24+
The first phase of the import is importing the site into a sqlite3 db. This intermediary db stores all the content.
25+
26+
Run: `ruby download_topics.rb`
27+
28+
### Importing the Sqlite db into Discourse
29+
30+
1. Start with a blank DB
31+
2. ... (in progress)

config.json.sample

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"api_username": "USER",
3+
"api_key": "API_KEY",
4+
"topics_query_id": "QUERY_ID",
5+
"posts_query_id": "QUERY_ID",
6+
"domain": "YOUR_DISCOURSE.com"
7+
}

download_topics.rb

+168
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
require "json"
2+
require "net/http"
3+
require "uri"
4+
require "sqlite3"
5+
require "mini_sql"
6+
require "cgi"
7+
8+
# Load runtime configuration; see config.json.sample for the expected keys.
# Fail fast when it is missing/unreadable — everything below reads from it,
# and continuing with a nil config would crash with an opaque NoMethodError.
begin
  config = JSON.parse(File.read("config.json"))
rescue StandardError
  abort "Please create a config.json file with your domain, API key and query ids (see config.json.sample)"
end

# Replace these values with your Discourse instance details
DISCOURSE_DOMAIN = config["domain"]
API_KEY = config["api_key"]
API_USERNAME = config["api_username"]
TOPIC_QUERY_ID = config["topics_query_id"]
POST_QUERY_ID = config["posts_query_id"]

# Local sqlite staging db that receives the downloaded public data.
sqlite_conn = SQLite3::Database.new("dump.db")
conn = MiniSql::Connection.get(sqlite_conn)
23+
24+
# Runs a data-explorer query on the remote Discourse instance and returns
# the parsed JSON payload.
#
# query_id - data explorer query id (as shown in the query URL)
# min_id   - pagination cursor; the query filters rows with id > :min_id
# limit    - maximum number of rows to return per call
#
# Raises RuntimeError on a non-2xx response so a bad API key or query id
# fails loudly instead of surfacing as a JSON parse error on an error page.
def run_report(query_id:, min_id: 0, limit:)
  params = CGI.escape({ min_id: min_id.to_s }.to_json)

  uri =
    URI(
      "https://#{DISCOURSE_DOMAIN}/admin/plugins/explorer/queries/#{query_id}/run?limit=#{limit}&params=#{params}"
    )
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = true

  request = Net::HTTP::Post.new(uri.request_uri)
  request["Content-Type"] = "application/json"
  request["Api-Key"] = API_KEY
  request["Api-Username"] = API_USERNAME

  response = http.request(request)
  unless response.is_a?(Net::HTTPSuccess)
    raise "Data explorer query #{query_id} failed: HTTP #{response.code} #{response.message}"
  end

  JSON.parse(response.body)
end
42+
43+
# Creates the three staging tables for the dump (topics, users, posts) when
# they do not already exist, so the script can be re-run safely.
# Only the ids are typed (INTEGER PRIMARY KEY); the remaining columns are
# declared without a type, relying on sqlite's dynamic typing.
def create_schema(conn)
  conn.exec <<-SQL
    CREATE TABLE IF NOT EXISTS topics (
      id INTEGER PRIMARY KEY,
      category,
      title,
      created_at,
      user_id,
      tags
    )
  SQL

  conn.exec <<-SQL
    CREATE TABLE IF NOT EXISTS users(
      id INTEGER PRIMARY KEY,
      username,
      name
    )
  SQL

  conn.exec <<-SQL
    CREATE TABLE IF NOT EXISTS posts(
      id INTEGER PRIMARY KEY,
      raw,
      post_number,
      topic_id,
      user_id,
      created_at
    )
  SQL
end
74+
75+
# Bulk-inserts post rows into the local sqlite dump inside one transaction.
# Rows are arrays in column order:
#   [id, raw, post_number, topic_id, user_id, created_at]
# Duplicate ids are skipped (INSERT OR IGNORE), so re-runs are idempotent.
#
# Returns { highest_id:, posts_loaded: }; highest_id is the largest post id
# seen and is used as the pagination cursor by the caller.
def load_posts(conn, rows)
  highest_id = 0
  posts_loaded = 0

  conn.exec "BEGIN TRANSACTION"

  begin
    rows.each do |row|
      conn.exec <<~SQL, *row
        INSERT OR IGNORE INTO posts (id, raw, post_number, topic_id, user_id, created_at)
        VALUES (?, ?, ?, ?, ?, ?)
      SQL
      posts_loaded += 1
      highest_id = row[0] if row[0] > highest_id
    end
  rescue StandardError
    # Don't leave the connection stuck inside an open transaction on failure.
    conn.exec "ROLLBACK TRANSACTION"
    raise
  end

  conn.exec "COMMIT TRANSACTION"

  { highest_id: highest_id, posts_loaded: posts_loaded }
end
94+
95+
# Bulk-inserts topic rows into the local sqlite dump inside one transaction.
# Rows are arrays in column order:
#   [id, category, title, created_at, user_id, tags]
# Duplicate ids are skipped (INSERT OR IGNORE), so re-runs are idempotent.
#
# Returns { highest_id:, topics_loaded: }; highest_id is the largest topic
# id seen and is used as the pagination cursor by the caller.
def load_topics(conn, rows)
  highest_id = 0
  topics_loaded = 0

  conn.exec "BEGIN TRANSACTION"

  begin
    rows.each do |row|
      conn.exec <<~SQL, *row
        INSERT OR IGNORE INTO topics (id, category, title, created_at, user_id, tags)
        VALUES (?, ?, ?, ?, ?, ?)
      SQL
      topics_loaded += 1
      highest_id = row[0] if row[0] > highest_id
    end
  rescue StandardError
    # Don't leave the connection stuck inside an open transaction on failure.
    conn.exec "ROLLBACK TRANSACTION"
    raise
  end

  conn.exec "COMMIT TRANSACTION"

  { highest_id: highest_id, topics_loaded: topics_loaded }
end
114+
115+
# Inserts user rows ([id, username, name]) inside one transaction, skipping
# ids that already exist (INSERT OR IGNORE). Returns the number of rows
# processed.
def load_users(conn, rows)
  inserted = 0
  conn.exec "BEGIN TRANSACTION"

  rows.each do |user_row|
    conn.exec(<<~SQL, *user_row)
      INSERT OR IGNORE INTO users(id, username, name)
      VALUES (?, ?, ?)
    SQL
    inserted += 1
  end

  conn.exec "COMMIT TRANSACTION"
  inserted
end
130+
131+
# Extracts the embedded user records ("relations" -> "user") from a data
# explorer response and loads them into the users table via load_users.
# No-op when the payload carries no user relation.
def load_users_from_json(conn, json)
  user_records = json.dig("relations", "user")
  return unless user_records

  rows = user_records.map { |u| [u["id"], u["username"], u["name"]] }
  loaded = load_users(conn, rows)
  puts "Loaded #{loaded} users"
end
139+
140+
# Build the local schema, then page through topics and posts until the data
# explorer returns an empty batch. Each response also piggybacks the related
# user records, which are loaded as we go.
create_schema(conn)

min_id = 0
loop do
  response_data = run_report(query_id: TOPIC_QUERY_ID, min_id: min_id, limit: 10_000)
  load_users_from_json(conn, response_data)

  result = load_topics(conn, response_data["rows"])
  puts "Loaded #{result[:topics_loaded]} topics (highest id is #{result[:highest_id]})"

  break if result[:topics_loaded] == 0

  # Advance the pagination cursor to the largest topic id seen so far.
  min_id = result[:highest_id]
end

min_id = 0
loop do
  response_data = run_report(query_id: POST_QUERY_ID, min_id: min_id, limit: 10_000)
  load_users_from_json(conn, response_data)

  result = load_posts(conn, response_data["rows"])
  puts "Loaded #{result[:posts_loaded]} posts (highest id is #{result[:highest_id]})"

  break if result[:posts_loaded] == 0

  # Advance the pagination cursor to the largest post id seen so far.
  min_id = result[:highest_id]
end

import_db.rb

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
require "sqlite3"
2+
require "mini_sql"
3+
4+
# Phase 2: read users out of the sqlite dump and create them inside a
# (blank) Discourse install, with randomized credentials.
sqlite_conn = SQLite3::Database.new("dump.db")
conn = MiniSql::Connection.get(sqlite_conn)

# NOTE(review): hardcoded developer path — point this at your own Discourse
# checkout (or make it configurable) before running.
Dir.chdir("/home/sam/Source/discourse")
require "/home/sam/Source/discourse/config/environment"

puts "Importing users..."

created = 0
conn
  .query("SELECT * FROM users")
  .each do |row|
    if !User.exists?(row.id)
      User.create(
        id: row.id,
        username: row.username,
        # The dump stores the display name separately (was row.username,
        # which silently discarded the name column).
        name: row.name,
        # Credentials are randomized: the dump is public data and these
        # accounts are placeholders, not real logins.
        password: SecureRandom.hex,
        email: "#{SecureRandom.hex}@email.com"
      )
      # Only count rows actually created, so the progress message below
      # stays truthful when re-running against an existing database.
      created += 1
    end
    print "."
    puts "#{created} users created" if created % 500 == 0
  end

post_query.sql

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
-- [params]
-- int :min_id = 0

-- Exports publicly visible posts for the data dump:
--   * the topic's category must not be read-restricted (anonymous access)
--   * neither the topic nor the post may be deleted
--   * regular posts only (post_type = 1) and not hidden
-- Paged by post id: the caller passes the last id seen as :min_id.
SELECT
  p.id,
  p.raw,
  p.post_number,
  p.topic_id,
  p.user_id,
  p.created_at
FROM topics t
JOIN posts p ON p.topic_id = t.id
JOIN categories c ON c.id = t.category_id
WHERE NOT c.read_restricted
AND t.deleted_at IS NULL
AND p.deleted_at IS NULL
AND p.post_type = 1
AND NOT p.hidden
AND p.id > :min_id
ORDER BY p.id ASC
21+

topic_query.sql

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
-- [params]
-- int :min_id = 0

-- Exports publicly visible topics (category not read-restricted, topic not
-- deleted) together with the category name and an aggregated tag list.
-- Paged by topic id: the caller passes the last id seen as :min_id.
SELECT
  t.id,
  c.name,
  t.title,
  t.created_at,
  t.user_id,
  -- collapse all tags on the topic into a single "tag1, tag2" string
  (SELECT STRING_AGG(tag.name, ', ') FROM topic_tags tt JOIN tags tag ON tag.id = tt.tag_id WHERE tt.topic_id = t.id) AS all_tags
FROM topics t
JOIN categories c ON c.id = t.category_id
WHERE NOT c.read_restricted AND t.deleted_at IS NULL
AND t.id > :min_id
ORDER BY t.id ASC

0 commit comments

Comments
 (0)