Skip to content

Commit

Permalink
Add file converting api
Browse files Browse the repository at this point in the history
  • Loading branch information
zensh committed Aug 8, 2023
1 parent 8baf5c5 commit e8e92c2
Show file tree
Hide file tree
Showing 13 changed files with 161 additions and 10 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
# yiwen-api
# yiwen-api

yiwen.ai 是一个由人和 AI 共同驱动的知识网络平台,它基于 AI 能力,能将知识文档翻译成大部分国家主流语言版本,也能对知识文档内容进行跨语言的语义搜索。
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ require (
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/leodido/go-urn v1.2.4 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
github.com/teambition/trie-mux v1.5.2 // indirect
github.com/x448/float16 v0.8.4 // indirect
golang.org/x/crypto v0.11.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ github.com/redis/go-redis/v9 v9.0.5 h1:CuQcn5HIEeK7BgElubPP8CGtE0KakrnbBSTLjathl
github.com/redis/go-redis/v9 v9.0.5/go.mod h1:WqMKv5vnQbRuZstUwxQI195wHy+t4PuXDOjzMvcuQHk=
github.com/rs/xid v1.5.0 h1:mKX4bl4iPYJtEIxp6CYiUuLQ/8DYMoz0PUdtGgMFRVc=
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
Expand Down
2 changes: 1 addition & 1 deletion src/api/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ func NewApp() *gear.App {
app := gear.New()

app.Set(gear.SetTrustedProxy, true)
app.Set(gear.SetBodyParser, &bodyParser{gear.DefaultBodyParser(2 << 19)}) // 1mb
app.Set(gear.SetBodyParser, &bodyParser{gear.DefaultBodyParser(2 << 18)}) // 512kb
// ignore TLS handshake error
app.Set(gear.SetLogger, log.New(gear.DefaultFilterWriter(), "", 0))
app.Set(gear.SetCompress, compressible.WithThreshold(256))
Expand Down
1 change: 1 addition & 0 deletions src/api/router.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ func newRouters(apis *APIs) []*gear.Router {
router.Get("/v1/search/by_original_url", middleware.AuthToken.Auth, apis.Jarvis.OriginalSearch)

router.Get("/v1/scraping", middleware.AuthToken.Auth, apis.Scraping.Create)
router.Post("/v1/converting", middleware.AuthToken.Auth, apis.Scraping.Convert)

router.Post("/v1/creation", middleware.AuthToken.Auth, apis.Creation.Create)
router.Get("/v1/creation", middleware.AuthToken.Auth, apis.Creation.Get)
Expand Down
41 changes: 41 additions & 0 deletions src/api/scraping.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
package api

import (
"io"
"mime"
"net/http"

"github.com/teambition/gear"

"github.com/yiwen-ai/yiwen-api/src/bll"
"github.com/yiwen-ai/yiwen-api/src/middleware"
"github.com/yiwen-ai/yiwen-api/src/util"
)

type Scraping struct {
Expand Down Expand Up @@ -32,3 +37,39 @@ func (a *Scraping) Create(ctx *gear.Context) error {
}
return ctx.OkSend(bll.SuccessResponse[bll.ScrapingOutput]{Result: *output})
}

// Convert accepts an uploaded file as the raw request body and converts it
// through the webscraper service.
//
// Only .html, .pdf, .md and .txt files are supported for now, i.e.
// `Content-Type: text/html`, `Content-Type: application/pdf`,
// `Content-Type: text/markdown` and `Content-Type: text/plain`.
// The upload must carry a Content-Type header; the request body is the file
// itself and must not exceed 512kb. Character encoding is normalized on the
// server side.
//
// Errors: an unparsable or unsupported media type yields
// gear.ErrUnsupportedMediaType, an oversized body yields
// gear.ErrRequestEntityTooLarge, any other body read failure yields
// gear.ErrBadRequest, and a failure of the downstream conversion yields
// gear.ErrInternalServerError.
func (a *Scraping) Convert(ctx *gear.Context) error {
	mtype := ctx.GetHeader(gear.HeaderContentType)
	if mtype == "" {
		// A missing header is treated as a generic binary upload.
		mtype = gear.MIMEOctetStream
	}
	mtype, _, err := mime.ParseMediaType(mtype)
	if err != nil {
		return gear.ErrUnsupportedMediaType.From(err)
	}

	reader := http.MaxBytesReader(ctx.Res, ctx.Req.Body, 2<<18) // 512kb
	defer reader.Close()

	buf, err := io.ReadAll(reader)
	if err != nil {
		// Only a breached size limit is a 413; other read failures (for
		// example a truncated body) are reported as a bad request.
		if _, tooLarge := err.(*http.MaxBytesError); tooLarge {
			return gear.ErrRequestEntityTooLarge.From(err)
		}
		return gear.ErrBadRequest.From(err)
	}

	buf, mtype, err = util.NormalizeFileEncodingAndType(buf, mtype)
	if err != nil {
		return err
	}

	// Record the normalized content type on the header value carried by ctx
	// before calling the downstream service.
	// NOTE(review): assumes a util.ContextHTTPHeader value was stored on ctx
	// earlier in the handler chain; a nil result would panic here — confirm.
	header := gear.CtxValue[util.ContextHTTPHeader](ctx)
	http.Header(*header).Set(gear.HeaderContentType, mtype)

	output, err := a.blls.Webscraper.Convert(ctx, buf, mtype)
	if err != nil {
		return gear.ErrInternalServerError.From(err)
	}
	return ctx.OkSend(bll.SuccessResponse[*util.Bytes]{Result: output})
}
9 changes: 9 additions & 0 deletions src/bll/webscraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,12 @@ func (b *Webscraper) Create(ctx context.Context, targetUrl string) (*ScrapingOut

return &output.Result, nil
}

// Convert posts the raw file bytes to the "/v1/converting" endpoint through
// b.svc and returns a pointer to the Result field of the decoded response.
//
// mtype is not referenced inside this method.
// NOTE(review): the API handler writes the content type into the HTTP header
// value carried by the context before calling this; presumably the service
// client forwards it — confirm.
func (b *Webscraper) Convert(ctx context.Context, file []byte, mtype string) (*util.Bytes, error) {
	var resp SuccessResponse[util.Bytes]

	err := b.svc.Post(ctx, "/v1/converting", file, &resp)
	if err != nil {
		return nil, err
	}

	return &resp.Result, nil
}
57 changes: 57 additions & 0 deletions src/util/file.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package util

import (
"strings"

"github.com/gabriel-vasile/mimetype"
"github.com/saintfish/chardet"
"github.com/teambition/gear"
"golang.org/x/text/encoding/ianaindex"
"golang.org/x/text/encoding/unicode"
)

// init configures the mimetype package to read at most 1MB of input when
// detecting a content type.
func init() {
	const detectLimit = 1 << 20 // 1MB
	mimetype.SetLimit(detectLimit)
}

// NormalizeFileEncodingAndType validates file content against its declared
// media type and converts textual content to UTF-8.
//
// buf is the raw file content and mtype is the declared media type (without
// parameters). The content is sniffed with mimetype.Detect and must agree
// with the declaration:
//
//   - "application/pdf" detected as "application/pdf" is returned untouched;
//   - "text/html" must be detected as "text/html";
//   - "text/markdown" and "text/plain" must be detected as "text/plain".
//
// For the text types the character set is guessed with chardet and, unless
// the resolved encoding is UTF-8, the bytes are decoded to UTF-8. On success
// the (possibly transcoded) bytes and the unchanged mtype are returned.
//
// Every failure is reported as a gear.ErrUnsupportedMediaType error: a
// mismatched or unknown type, an undetectable charset, a charset that cannot
// be resolved to a usable decoder, or bytes that fail to decode.
func NormalizeFileEncodingAndType(buf []byte, mtype string) ([]byte, string, error) {
	mt := mimetype.Detect(buf)

	var de *chardet.Detector
	switch {
	case mtype == "application/pdf" && mt.Is("application/pdf"):
		// Binary format: nothing to transcode.
		return buf, mtype, nil
	case mtype == "text/html" && mt.Is("text/html"):
		de = chardet.NewHtmlDetector()
	case mtype == "text/markdown" && mt.Is("text/plain"):
		// NOTE(review): the HTML detector is used for markdown as well,
		// presumably because markdown may embed HTML — confirm.
		de = chardet.NewHtmlDetector()
	case mtype == "text/plain" && mt.Is("text/plain"):
		de = chardet.NewTextDetector()
	default:
		return nil, "", gear.ErrUnsupportedMediaType.WithMsgf("unsupported media type: %s", mt.String())
	}

	rt, err := de.DetectBest(buf)
	if err != nil {
		return nil, "", gear.ErrUnsupportedMediaType.From(err)
	}

	// Resolve the detected charset name; if the name is rejected, retry with
	// hyphens removed (presumably to cope with detector names that are spelled
	// differently from the IANA registry).
	enc, err := ianaindex.IANA.Encoding(rt.Charset)
	if err != nil {
		enc, err = ianaindex.IANA.Encoding(strings.ReplaceAll(rt.Charset, "-", ""))
	}
	if err != nil {
		return nil, "", gear.ErrUnsupportedMediaType.From(err)
	}

	// The IANA index yields a nil encoding together with a nil error for
	// charsets that are registered but have no implementation. Reject those
	// explicitly instead of panicking on enc.NewDecoder() below.
	if enc == nil {
		return nil, "", gear.ErrUnsupportedMediaType.WithMsgf("unsupported charset: %s", rt.Charset)
	}

	if enc != unicode.UTF8 {
		buf, err = enc.NewDecoder().Bytes(buf)
		if err != nil {
			return nil, "", gear.ErrUnsupportedMediaType.From(err)
		}
	}

	return buf, mtype, nil
}
24 changes: 24 additions & 0 deletions src/util/file_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package util

import (
"os"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestFile checks that normalizing the fixture named file_test_gb18030.md as
// "text/markdown" yields exactly the bytes of file_test_utf8.md and keeps the
// declared media type.
func TestFile(t *testing.T) {
	t.Run("NormalizeFileEncodingAndType", func(t *testing.T) {
		source, err := os.ReadFile("./file_test_gb18030.md")
		require.NoError(t, err)

		want, err := os.ReadFile("./file_test_utf8.md")
		require.NoError(t, err)

		got, gotType, err := NormalizeFileEncodingAndType(source, "text/markdown")
		require.NoError(t, err)

		assert.Equal(t, "text/markdown", gotType)
		assert.Equal(t, want, got)
	})
}
3 changes: 3 additions & 0 deletions src/util/file_test_gb18030.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# yiwen-api

yiwen.ai 是一个由人和 AI 共同驱动的知识网络平台,它基于 AI 能力能将知识文档翻译成大部分国家主流语言版本,也能对知识文档内容进行跨语言的语义搜索。
3 changes: 3 additions & 0 deletions src/util/file_test_utf8.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# yiwen-api

yiwen.ai 是一个由人和 AI 共同驱动的知识网络平台,它基于 AI 能力能将知识文档翻译成大部分国家主流语言版本,也能对知识文档内容进行跨语言的语义搜索。
22 changes: 14 additions & 8 deletions src/util/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ func init() {
userAgent = fmt.Sprintf("Go/%v yiwen-api", runtime.Version())
}

type ContextHTTPHeader http.Header

var userAgent string

var externalTr = &http.Transport{
Expand Down Expand Up @@ -68,16 +66,21 @@ var HTTPClient = &http.Client{

var ErrNotFound = gear.ErrNotFound

type ContextHTTPHeader http.Header

func RequestJSON(ctx context.Context, cli *http.Client, method, api string, input, output any) error {
if ctx.Err() != nil {
return nil
}

var err error
var body io.Reader
if input != nil {
data, err := json.Marshal(input)
if err != nil {
return err
data, ok := input.([]byte)
if !ok {
if data, err = json.Marshal(input); err != nil {
return err
}
}
body = bytes.NewReader(data)
}
Expand Down Expand Up @@ -126,11 +129,14 @@ func RequestCBOR(ctx context.Context, cli *http.Client, method, api string, inpu
return nil
}

var err error
var body io.Reader
if input != nil {
data, err := cbor.Marshal(input)
if err != nil {
return err
data, ok := input.([]byte)
if !ok {
if data, err = cbor.Marshal(input); err != nil {
return err
}
}
body = bytes.NewReader(data)
}
Expand Down
2 changes: 2 additions & 0 deletions tests/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ import (
)

func TestAPI(t *testing.T) {
t.Skip("Skip")

var cookie string = "YW_DID=ciqui86rupojrarr1ag0; YW_SESS=HFmzbngWbfeJ73UpYSLRkUJxuCZQKV6ee7ti8zWAkAk"

var targetUrl string = "https://datatracker.ietf.org/doc/html/rfc8949"
Expand Down

0 comments on commit e8e92c2

Please sign in to comment.