Skip to content

Commit 58ba1d5

Browse files
committed
magika: add ml-based content type detection
Signed-off-by: Hank Donnay <[email protected]>
1 parent 52da6d5 commit 58ba1d5

File tree

9 files changed

+1565
-0
lines changed

9 files changed

+1565
-0
lines changed

detector/magika/_cmd/ortgen/go.mod

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
module ortgen
2+
3+
go 1.24.3
4+
5+
require modernc.org/cc/v4 v4.27.1
6+
7+
require (
8+
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
9+
modernc.org/mathutil v1.7.1 // indirect
10+
modernc.org/opt v0.1.4 // indirect
11+
modernc.org/sortutil v1.2.1 // indirect
12+
modernc.org/strutil v1.2.1 // indirect
13+
modernc.org/token v1.1.0 // indirect
14+
)

detector/magika/_cmd/ortgen/go.sum

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
2+
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
3+
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
4+
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
5+
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
6+
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
7+
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
8+
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
9+
modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis=
10+
modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0=
11+
modernc.org/ccorpus2 v1.5.2 h1:Ui+4tc58mf/W+2arcYCJR903y3zl3ecsI7Fpaaqozyw=
12+
modernc.org/ccorpus2 v1.5.2/go.mod h1:Wifvo4Q/qS/h1aRoC2TffcHsnxwTikmi1AuLANuucJQ=
13+
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
14+
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
15+
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
16+
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
17+
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
18+
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
19+
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
20+
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
21+
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
22+
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
// Ortgen is a helper program to generate type information from the ONNX Runtime
2+
// release tarball.
3+
package main
4+
5+
import (
6+
"archive/tar"
7+
"bufio"
8+
"bytes"
9+
"compress/gzip"
10+
"context"
11+
"flag"
12+
"fmt"
13+
"io"
14+
"io/fs"
15+
"log/slog"
16+
"net/http"
17+
"os"
18+
"path/filepath"
19+
"strconv"
20+
"strings"
21+
"text/template"
22+
23+
"modernc.org/cc/v4"
24+
)
25+
26+
// urlTmpl is the default text/template source for the ONNX Runtime release
// tarball URL. The value of the -version flag is substituted for every {{.}}.
const urlTmpl = "https://github.com/microsoft/onnxruntime/releases/download/v{{.}}/onnxruntime-linux-x64-{{.}}.tgz"
27+
28+
func main() {
29+
var code int
30+
defer func() {
31+
if code != 0 {
32+
os.Exit(code)
33+
}
34+
}()
35+
var out *os.File
36+
tmpl := template.Must(template.New(`url`).Parse(urlTmpl))
37+
var loglevel slog.LevelVar
38+
loglevel.Set(slog.LevelError)
39+
slog.SetDefault(slog.New(slog.NewTextHandler(
40+
os.Stderr,
41+
&slog.HandlerOptions{
42+
Level: &loglevel,
43+
})))
44+
ctx := context.Background()
45+
46+
vers := flag.String("version", `1.15.1`, "use `version` as the argument to the URL template")
47+
flag.Func("url", "fetch ONNX Runtime release tarball from templated `URL`", func(text string) error {
48+
if _, err := tmpl.Parse(text); err != nil {
49+
return err
50+
}
51+
return nil
52+
})
53+
pkg := flag.String("package", "magika", "generated package `name`")
54+
flag.Func("o", "output to `file`", func(p string) error {
55+
if out != nil {
56+
if err := out.Close(); err != nil {
57+
return err
58+
}
59+
}
60+
f, err := os.Create(p)
61+
if err != nil {
62+
return err
63+
}
64+
out = f
65+
return nil
66+
})
67+
flag.BoolFunc("D", "debug log output", func(v string) error {
68+
if ok, err := strconv.ParseBool(v); err == nil && ok {
69+
loglevel.Set(slog.LevelDebug)
70+
}
71+
return nil
72+
})
73+
flag.Parse()
74+
75+
var sb strings.Builder
76+
if err := tmpl.Execute(&sb, *vers); err != nil {
77+
slog.ErrorContext(ctx, "unexpected template error", "reason", err)
78+
code = 1
79+
return
80+
}
81+
slog.InfoContext(ctx, "templated URL", "url", &sb)
82+
83+
if out == nil {
84+
out = os.Stdout
85+
}
86+
defer out.Close()
87+
88+
if err := Main(ctx, out, sb.String(), *pkg); err != nil {
89+
slog.ErrorContext(ctx, "unexpected error", "reason", err)
90+
code = 1
91+
}
92+
}
93+
94+
// genHeader is the first line written to the generated file; it marks the
// file as generated code.
const genHeader = `// Code generated by ortgen. DO NOT EDIT.`
95+
96+
func Main(ctx context.Context, out io.Writer, in, pkg string) error {
97+
const header = `onnxruntime_c_api.h`
98+
dir, err := fetchTarball(ctx, in)
99+
if err != nil {
100+
return fmt.Errorf("fetching: %w", err)
101+
}
102+
defer os.RemoveAll(dir.Name())
103+
defer dir.Close()
104+
105+
cfg, err := cc.NewConfig("linux", "amd64")
106+
if err != nil {
107+
return fmt.Errorf("unable to create config: %w", err)
108+
}
109+
ms, _ := fs.Glob(dir.FS(), `*/include/`+header)
110+
f, err := dir.Open(ms[0])
111+
if err != nil {
112+
return fmt.Errorf("unable to open header: %q: %w", ms[0], err)
113+
}
114+
src := []cc.Source{
115+
{Name: "<predefined>", Value: cfg.Predefined},
116+
{Name: "<builtin>", Value: cc.Builtin},
117+
{Name: "<bool>", Value: "#define bool _Bool\n"},
118+
{Name: header, Value: f},
119+
}
120+
ast, err := cc.Translate(cfg, src)
121+
if err != nil {
122+
return fmt.Errorf("parse error: %w", err)
123+
}
124+
125+
fmt.Fprintf(out, "%s\n\npackage %s\n\nimport \"structs\"\n", genHeader, pkg)
126+
// Walk the list, looking for the bits we want:
127+
var buf bytes.Buffer
128+
for cur := ast.TranslationUnit; cur != nil; cur = cur.TranslationUnit {
129+
buf.Reset()
130+
if cur.ExternalDeclaration.Case != cc.ExternalDeclarationDecl {
131+
continue
132+
}
133+
decl := cur.ExternalDeclaration.Declaration
134+
if decl.Position().Filename != header {
135+
continue
136+
}
137+
spec := decl.DeclarationSpecifiers
138+
if spec == nil || spec.Case != cc.DeclarationSpecifiersTypeSpec {
139+
continue
140+
}
141+
ty := spec.TypeSpecifier
142+
if ty == nil || ty.Case != cc.TypeSpecifierStructOrUnion {
143+
continue
144+
}
145+
structSpec := ty.StructOrUnionSpecifier
146+
if structSpec == nil ||
147+
structSpec.Case != cc.StructOrUnionSpecifierDef ||
148+
structSpec.StructOrUnion.Case != cc.StructOrUnionStruct {
149+
continue
150+
}
151+
152+
n := structSpec.Token.SrcStr()
153+
switch n {
154+
case "OrtApi":
155+
case "OrtApiBase":
156+
default:
157+
continue
158+
}
159+
fmt.Fprintf(&buf, "\ntype %s struct {\n\t_ structs.HostLayout\n\n", strings.Replace(n, "O", "o", 1))
160+
161+
for cur := structSpec.StructDeclarationList; cur != nil; cur = cur.StructDeclarationList {
162+
buf.WriteString("\t// ")
163+
buf.WriteString(cc.NodeSource(cur.StructDeclaration))
164+
buf.WriteByte('\n')
165+
166+
// pull out the function pointer ident:
167+
decl := cur.StructDeclaration.StructDeclaratorList.StructDeclarator.Declarator.DirectDeclarator.DirectDeclarator.Declarator.DirectDeclarator
168+
buf.WriteByte('\t')
169+
buf.Write(decl.Token.Src())
170+
buf.WriteString(" uintptr\n")
171+
}
172+
173+
buf.WriteString("}\n")
174+
if _, err := io.Copy(out, &buf); err != nil {
175+
return err
176+
}
177+
}
178+
179+
return nil
180+
}
181+
182+
// fetchTarball downloads the gzip-compressed tar archive at the URL "in" and
// unpacks it into a newly created temporary directory.
//
// On success the returned root is opened on that directory and the caller
// owns both: it should close the root and remove the directory named by
// root.Name(). On failure the directory is removed again and an error is
// returned. A response status other than 200, an archive that cannot be read
// through to its end, and any failure to create or write an entry are all
// errors.
func fetchTarball(ctx context.Context, in string) (*os.Root, error) {
	slog.DebugContext(ctx, "fetching tarball", "url", in)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, in, nil)
	if err != nil {
		return nil, err
	}
	res, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	// Registered before the status check so the body is released on the
	// bad-status path too.
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status: %v", res.Status)
	}
	z, err := gzip.NewReader(res.Body)
	if err != nil {
		return nil, err
	}

	d, err := os.MkdirTemp("", "ortgen.")
	if err != nil {
		return nil, err
	}
	slog.DebugContext(ctx, "untarring to disk", "path", d)
	root, err := os.OpenRoot(d)
	if err != nil {
		// The cleanup below is not armed yet; do not leave the directory.
		os.RemoveAll(d)
		return nil, err
	}

	ok := false
	defer func() {
		if !ok {
			root.Close()
			os.RemoveAll(d)
		}
	}()

	rd := tar.NewReader(bufio.NewReader(z))
	for {
		h, err := rd.Next()
		if err == io.EOF {
			// Clean end of archive.
			break
		}
		if err != nil {
			// Anything else means a corrupt or truncated archive; reporting
			// success here would hand back a partial tree.
			return nil, fmt.Errorf("reading archive: %w", err)
		}
		if err := extractEntry(root, h, rd); err != nil {
			return nil, fmt.Errorf("extracting %q: %w", h.Name, err)
		}
	}

	ok = true
	return root, nil
}

// extractEntry writes the archive entry described by h into root, reading the
// entry's contents from r. Directory entries are created with mode 0o755;
// every other entry is created as a file and filled from r.
func extractEntry(root *os.Root, h *tar.Header, r io.Reader) error {
	p := filepath.Join(".", h.Name)
	if h.FileInfo().IsDir() {
		return root.Mkdir(p, 0o755)
	}
	f, err := root.Create(p)
	if err != nil {
		return err
	}
	if _, err := io.Copy(f, r); err != nil {
		f.Close()
		return err
	}
	// Close can report a deferred write failure, so its error is returned.
	return f.Close()
}

0 commit comments

Comments
 (0)