forked from muhammad-abdurrahman/hans-wehr-fodt-parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.rb
executable file
·72 lines (51 loc) · 2.11 KB
/
parser.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
require 'nokogiri'
require 'json'
require "sqlite3"
# require 'pry'
# # check unicode values for arabic characters (between 1571 and 1618)
# ara_chars = "ابتثجحخدذرزسشصضطظعغفقكلمنهويأإىؤءئًٌٍَُِّْ"
# ara_chars.chars.each { |char| puts char + ": " + char.ord.to_s }
# Parse the dictionary source. "hanswehr.xml" is resolved relative to the
# current working directory; the file handle is closed when the block exits.
hw_source = File.open("hanswehr.xml") { |f| Nokogiri::XML(f) }

# Map "margin-left + text-indent" (rendered as a string, e.g. "0.0") to the
# list of paragraph style names that have that effective left offset.
styles = Hash.new { |h, k| h[k] = [] }
hw_source.xpath("//style:paragraph-properties[@fo:margin-left]").each do |props|
  # The XPath only guarantees fo:margin-left. fo:text-indent may be absent, in
  # which case Node#[] returns nil; `to_s` makes a missing value count as 0
  # instead of raising NoMethodError. `delete('in')` drops the unit characters.
  margin = props["fo:margin-left"].to_s.delete('in').to_f
  indent = props["fo:text-indent"].to_s.delete('in').to_f
  styles["#{margin + indent}"] << props.parent["style:name"]
end

# Style names with a total offset of exactly 0.0; read by check_is_root.
$root_word_styles = styles["0.0"]

# First run of characters in U+0620..U+0660 (plus spaces) that is delimited by
# a space or a line boundary on both sides.
word_regex = /(?<= |^)[\u0620-\u0660 ]+(?= |$)/

# Running state for the import loop further down: the id of the most recent
# root entry and the next row id to assign.
current_root = nil
autonum = 1
# Reports whether a paragraph node is a root-word entry.
#
# Only the paragraph style is consulted at the moment: the node counts as a
# root when the value of its "style-name" attribute is listed in the global
# $root_word_styles. A node without that attribute is never a root.
#
# @param tag [#attributes] node whose `attributes` hash maps attribute names
#   to objects responding to `value`
# @return [Boolean] true when the node's style is one of the root-word styles
def check_is_root(tag)
  # `&.` guards paragraphs that carry no style-name attribute at all; the
  # lookup returns nil for them and calling `.value` on it would raise.
  style_name = tag.attributes["style-name"]&.value
  return false if style_name.nil?

  # TODO: not yet implemented - regex match. Would match a lot of root words
  # based on fitting the expression "فعل fa'ala a" with some extra checks; it
  # has to be directly at the beginning of the definition:
  #   /^\d?[\u0620-\u0660]{3} *[a-z']+ *[aiu] /
  # TODO: not yet implemented - roman numerals match.
  $root_word_styles.include?(style_name)
end
# Open the SQLite database file "hanswehr.db" (relative to the current
# working directory).
db = SQLite3::Database.new "hanswehr.db"
puts "Open"

# Until the first root entry is seen, rows are filed under the first row's id.
current_root = autonum

begin
  # One transaction for the whole import: SQLite commits once instead of once
  # per row, and a failure part-way through rolls back rather than leaving a
  # partially filled table.
  db.transaction do
    # Block form of prepare closes the statement when the block exits.
    db.prepare(<<-SQL) do |insert|
      INSERT INTO WordView (rowid, RootWordId, ArabicWord, IsRoot, Definition)
      VALUES (?,?,?,?,?)
    SQL
      # One row per paragraph of the document body, numbered in document order.
      hw_source.xpath("//office:text/text:p").each do |tag|
        text = tag.text
        arabic_word = word_regex.match(text).to_s # "" when nothing matches
        is_root = check_is_root(tag)

        # NOTE(review): current_root is advanced only after the insert, so a
        # root row is stored with the *previous* root's id as RootWordId -
        # confirm this is intended.
        insert.execute autonum, current_root, arabic_word, is_root ? 1 : 0, text

        current_root = autonum if is_root
        autonum += 1
      end
    end
  end
ensure
  # Release the database handle even if the import raised.
  db.close
end
# File.write 'results.json', root_words[0...10000].to_json
# Pry::ColorPrinter.pp(styles.sort_by{|k,v| k}.to_h)
# f_out = File.new("out.txt", "w+")
# f_out.write hw_source.xpath("//text:p[@text:style-name='P15']").first
# f_out.close