diff --git a/README.md b/README.md index 958720f..0588da7 100644 --- a/README.md +++ b/README.md @@ -1 +1,3 @@ -# yiwen-api \ No newline at end of file +# yiwen-api + +yiwen.ai 是一个由人和 AI 共同驱动的知识网络平台,它基于 AI 能力能将知识文档翻译成大部分国家主流语言版本,也能对知识文档内容进行跨语言的语义搜索。 \ No newline at end of file diff --git a/go.mod b/go.mod index 4d22e54..0cf035b 100644 --- a/go.mod +++ b/go.mod @@ -30,6 +30,7 @@ require ( github.com/go-playground/universal-translator v0.18.1 // indirect github.com/leodido/go-urn v1.2.4 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect github.com/teambition/trie-mux v1.5.2 // indirect github.com/x448/float16 v0.8.4 // indirect golang.org/x/crypto v0.11.0 // indirect diff --git a/go.sum b/go.sum index f7a74c3..751e26b 100644 --- a/go.sum +++ b/go.sum @@ -46,6 +46,8 @@ github.com/redis/go-redis/v9 v9.0.5 h1:CuQcn5HIEeK7BgElubPP8CGtE0KakrnbBSTLjathl github.com/redis/go-redis/v9 v9.0.5/go.mod h1:WqMKv5vnQbRuZstUwxQI195wHy+t4PuXDOjzMvcuQHk= github.com/rs/xid v1.5.0 h1:mKX4bl4iPYJtEIxp6CYiUuLQ/8DYMoz0PUdtGgMFRVc= github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= diff --git a/src/api/app.go b/src/api/app.go index 96eb18d..e0e3f08 100644 --- a/src/api/app.go +++ b/src/api/app.go @@ -19,7 +19,7 @@ func NewApp() *gear.App { app := gear.New() app.Set(gear.SetTrustedProxy, true) - app.Set(gear.SetBodyParser, &bodyParser{gear.DefaultBodyParser(2 << 19)}) // 1mb + app.Set(gear.SetBodyParser, &bodyParser{gear.DefaultBodyParser(2 << 18)}) // 512kb // ignore TLS handshake error app.Set(gear.SetLogger, log.New(gear.DefaultFilterWriter(), "", 0)) app.Set(gear.SetCompress, compressible.WithThreshold(256)) diff --git a/src/api/router.go b/src/api/router.go index 8ffa488..1587fd6 100644 --- a/src/api/router.go +++ b/src/api/router.go @@ -55,6 +55,7 @@ func newRouters(apis *APIs) []*gear.Router { router.Get("/v1/search/by_original_url", middleware.AuthToken.Auth, apis.Jarvis.OriginalSearch) router.Get("/v1/scraping", middleware.AuthToken.Auth, apis.Scraping.Create) + router.Post("/v1/converting", middleware.AuthToken.Auth, apis.Scraping.Convert) router.Post("/v1/creation", middleware.AuthToken.Auth, apis.Creation.Create) router.Get("/v1/creation", middleware.AuthToken.Auth, apis.Creation.Get) diff --git a/src/api/scraping.go b/src/api/scraping.go index b9cfd88..5f902bb 100644 --- a/src/api/scraping.go +++ b/src/api/scraping.go @@ -1,10 +1,15 @@ package api import ( + "io" + "mime" + "net/http" + "github.com/teambition/gear" "github.com/yiwen-ai/yiwen-api/src/bll" "github.com/yiwen-ai/yiwen-api/src/middleware" + "github.com/yiwen-ai/yiwen-api/src/util" ) type Scraping struct { @@ -32,3 +37,39 @@ func (a *Scraping) Create(ctx *gear.Context) error { } return ctx.OkSend(bll.SuccessResponse[bll.ScrapingOutput]{Result: *output}) } + +// 目前仅支持 .html, .pdf, .md, .txt 文件, +// 即:`Content-Type: text/html`, `Content-Type: application/pdf`, `Content-Type: text/markdown`, `Content-Type: text/plain` +// 上传文件时必须携带 Content-Type,请求体为文件本身,不能超过 512kb +// 服务端会自动处理字符编码。 +func (a *Scraping) Convert(ctx *gear.Context) error { + var mtype string + var err error + if mtype = ctx.GetHeader(gear.HeaderContentType); mtype == "" { + mtype = gear.MIMEOctetStream + } + mtype, _, err = mime.ParseMediaType(mtype) + if err != nil { + return gear.ErrUnsupportedMediaType.From(err) + } + + reader := http.MaxBytesReader(ctx.Res, ctx.Req.Body, 2<<18) // 512kb + buf, err := io.ReadAll(reader) + if err != nil { + reader.Close() + return gear.ErrRequestEntityTooLarge.From(err) + } + reader.Close() + buf, mtype, err = util.NormalizeFileEncodingAndType(buf, mtype) + if err != nil { + return err + } + + header := gear.CtxValue[util.ContextHTTPHeader](ctx) + http.Header(*header).Set(gear.HeaderContentType, mtype) + output, err := a.blls.Webscraper.Convert(ctx, buf, mtype) + if err != nil { + return gear.ErrInternalServerError.From(err) + } + return ctx.OkSend(bll.SuccessResponse[*util.Bytes]{Result: output}) +} diff --git a/src/bll/webscraper.go b/src/bll/webscraper.go index a7f6174..7f7cb55 100644 --- a/src/bll/webscraper.go +++ b/src/bll/webscraper.go @@ -75,3 +75,12 @@ func (b *Webscraper) Create(ctx context.Context, targetUrl string) (*ScrapingOut return &output.Result, nil } + +func (b *Webscraper) Convert(ctx context.Context, file []byte, mtype string) (*util.Bytes, error) { + output := SuccessResponse[util.Bytes]{} + if err := b.svc.Post(ctx, "/v1/converting", file, &output); err != nil { + return nil, err + } + + return &output.Result, nil +} diff --git a/src/util/file.go b/src/util/file.go new file mode 100644 index 0000000..54884c8 --- /dev/null +++ b/src/util/file.go @@ -0,0 +1,57 @@ +package util + +import ( + "strings" + + "github.com/gabriel-vasile/mimetype" + "github.com/saintfish/chardet" + "github.com/teambition/gear" + "golang.org/x/text/encoding/ianaindex" + "golang.org/x/text/encoding/unicode" +) + +func init() { + mimetype.SetLimit(1024 * 1024) // 1MB +} + +func NormalizeFileEncodingAndType(buf []byte, mtype string) ([]byte, string, error) { + mt := mimetype.Detect(buf) + + var de *chardet.Detector + switch { + case mtype == "application/pdf" && mt.Is("application/pdf"): + return buf, mtype, nil + case mtype == "text/html" && mt.Is("text/html"): + de = chardet.NewHtmlDetector() + case mtype == "text/markdown" && mt.Is("text/plain"): + de = chardet.NewHtmlDetector() + case mtype == "text/plain" && mt.Is("text/plain"): + de = chardet.NewTextDetector() + default: + return nil, "", gear.ErrUnsupportedMediaType.WithMsgf("unsupported media type: %s", mt.String()) + } + + rt, err := de.DetectBest(buf) + if err != nil { + return nil, "", gear.ErrUnsupportedMediaType.From(err) + } + + enc, err := ianaindex.IANA.Encoding(rt.Charset) + if err != nil { + enc, err = ianaindex.IANA.Encoding(strings.ReplaceAll(rt.Charset, "-", "")) + } + + if err != nil { + return nil, "", gear.ErrUnsupportedMediaType.From(err) + } + + if enc != unicode.UTF8 { + decoder := enc.NewDecoder() + buf, err = decoder.Bytes(buf) + if err != nil { + return nil, "", gear.ErrUnsupportedMediaType.From(err) + } + } + + return buf, mtype, nil +} diff --git a/src/util/file_test.go b/src/util/file_test.go new file mode 100644 index 0000000..28bf0de --- /dev/null +++ b/src/util/file_test.go @@ -0,0 +1,24 @@ +package util + +import ( + "os" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestFile(t *testing.T) { + t.Run("NormalizeFileEncodingAndType", func(t *testing.T) { + file1, err := os.ReadFile("./file_test_gb18030.md") + require.NoError(t, err) + + file2, err := os.ReadFile("./file_test_utf8.md") + require.NoError(t, err) + + buf, mt, err := NormalizeFileEncodingAndType(file1, "text/markdown") + require.NoError(t, err) + assert.Equal(t, "text/markdown", mt) + assert.Equal(t, file2, buf) + }) +} diff --git a/src/util/file_test_gb18030.md b/src/util/file_test_gb18030.md new file mode 100644 index 0000000..e96a774 --- /dev/null +++ b/src/util/file_test_gb18030.md @@ -0,0 +1,3 @@ +# yiwen-api + +yiwen.ai һ˺ AI ֪ͬʶƽ̨ AI ֪ܽʶĵɴ󲿷ֹ԰汾Ҳ֪ܶʶĵݽпԵ \ No newline at end of file diff --git a/src/util/file_test_utf8.md b/src/util/file_test_utf8.md new file mode 100644 index 0000000..0588da7 --- /dev/null +++ b/src/util/file_test_utf8.md @@ -0,0 +1,3 @@ +# yiwen-api + +yiwen.ai 是一个由人和 AI 共同驱动的知识网络平台,它基于 AI 能力能将知识文档翻译成大部分国家主流语言版本,也能对知识文档内容进行跨语言的语义搜索。 \ No newline at end of file diff --git a/src/util/http.go b/src/util/http.go index 741d24c..48f8afe 100644 --- a/src/util/http.go +++ b/src/util/http.go @@ -23,8 +23,6 @@ func init() { userAgent = fmt.Sprintf("Go/%v yiwen-api", runtime.Version()) } -type ContextHTTPHeader http.Header - var userAgent string var externalTr = &http.Transport{ @@ -68,16 +66,21 @@ var HTTPClient = &http.Client{ var ErrNotFound = gear.ErrNotFound +type ContextHTTPHeader http.Header + func RequestJSON(ctx context.Context, cli *http.Client, method, api string, input, output any) error { if ctx.Err() != nil { return nil } + var err error var body io.Reader if input != nil { - data, err := json.Marshal(input) - if err != nil { - return err + data, ok := input.([]byte) + if !ok { + if data, err = json.Marshal(input); err != nil { + return err + } } body = bytes.NewReader(data) } @@ -126,11 +129,14 @@ func RequestCBOR(ctx context.Context, cli *http.Client, method, api string, inpu return nil } + var err error var body io.Reader if input != nil { - data, err := cbor.Marshal(input) - if err != nil { - return err + data, ok := input.([]byte) + if !ok { + if data, err = cbor.Marshal(input); err != nil { + return err + } } body = bytes.NewReader(data) } diff --git a/tests/api_test.go b/tests/api_test.go index 34b9bbb..d10ac84 100644 --- a/tests/api_test.go +++ b/tests/api_test.go @@ -17,6 +17,8 @@ import ( ) func TestAPI(t *testing.T) { + t.Skip("Skip") + var cookie string = "YW_DID=ciqui86rupojrarr1ag0; YW_SESS=HFmzbngWbfeJ73UpYSLRkUJxuCZQKV6ee7ti8zWAkAk" var targetUrl string = "https://datatracker.ietf.org/doc/html/rfc8949"