Merge pull request #5 from knights-analytics/awesome-go
Awesome go
riccardopinosio authored Mar 12, 2024
2 parents 1784bb9 + bdd5135 commit a586141
Showing 23 changed files with 692 additions and 307 deletions.
53 changes: 32 additions & 21 deletions .github/workflows/release.yaml
@@ -10,27 +10,38 @@ permissions:
 
 jobs:
   test:
-    name: Run test suite
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-      - name: Build and run tests
-        run: make run-tests
-      - name: print
-        run: ls -la
-      - uses: actions/upload-artifact@v4
-        with:
-          name: libtokenizers.a
-          path: ./artifacts/libtokenizers.a
-      - uses: actions/upload-artifact@v4
-        with:
-          name: onnxruntime.so
-          path: ./artifacts/onnxruntime.so
-      - uses: actions/upload-artifact@v4
-        with:
-          name: hugot-cli-linux-amd64
-          path: ./artifacts/hugot-cli-linux-amd64
+    name: Run test suite
+    runs-on: ubuntu-latest
+    steps:
+      - name: Set up Go
+        uses: actions/setup-go@v2
+        with:
+          go-version: '1.20.0'
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Install dependencies
+        run: |
+          go mod download
+      - name: Build and run tests
+        run: make run-tests
+      - name: Install goveralls
+        run: go install github.com/mattn/goveralls@latest
+      - name: Send coverage
+        env:
+          COVERALLS_TOKEN: ${{ secrets.COVERALLS_TOKEN }}
+        run: goveralls -coverprofile=./testTarget/unit/cover.out -service=github
+      - uses: actions/upload-artifact@v4
+        with:
+          name: libtokenizers.a
+          path: ./artifacts/libtokenizers.a
+      - uses: actions/upload-artifact@v4
+        with:
+          name: onnxruntime.so
+          path: ./artifacts/onnxruntime.so
+      - uses: actions/upload-artifact@v4
+        with:
+          name: hugot-cli-linux-amd64
+          path: ./artifacts/hugot-cli-linux-amd64
   release:
     name: Release
     runs-on: ubuntu-latest
15 changes: 13 additions & 2 deletions .github/workflows/test.yaml
@@ -10,12 +10,23 @@ jobs:
     name: Run test suite
     runs-on: ubuntu-latest
     steps:
+      - name: Set up Go
+        uses: actions/setup-go@v2
+        with:
+          go-version: '1.20.0'
       - name: Checkout code
         uses: actions/checkout@v4
+      - name: Install dependencies
+        run: |
+          go mod download
       - name: Build and run tests
         run: make run-tests
-      - name: print
-        run: ls -la
+      - name: Install goveralls
+        run: go install github.com/mattn/goveralls@latest
+      - name: Send coverage
+        env:
+          COVERALLS_TOKEN: ${{ secrets.COVERALLS_TOKEN }}
+        run: goveralls -coverprofile=./testTarget/unit/cover.out -service=github
       - uses: actions/upload-artifact@v4
         with:
           name: libtokenizers.a
1 change: 1 addition & 0 deletions .gitignore
@@ -2,3 +2,4 @@
 .idea
 models
 testTarget
+artifacts
34 changes: 5 additions & 29 deletions Dockerfile
@@ -10,7 +10,7 @@ RUN git clone https://github.com/knights-analytics/tokenizers -b main && \
     cd tokenizers && \
     cargo build --release
 
-#--- build layer ---
+#--- build and test layer ---
 
 FROM public.ecr.aws/amazonlinux/amazonlinux:2023 AS building
 ARG GO_VERSION
@@ -37,41 +37,17 @@ RUN GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o test2json -ldflags="-s -w"
curl -LO https://github.com/gotestyourself/gotestsum/releases/download/v1.11.0/gotestsum_1.11.0_linux_amd64.tar.gz && \
tar -xzf gotestsum_1.11.0_linux_amd64.tar.gz --directory /usr/local/bin

COPY ./models /models

# build cli
COPY . /build
WORKDIR /build
RUN go mod download && CGO_ENABLED=1 GOOS=linux GOARCH=amd64 && \
mkdir /unittest && go test -c . -o /unittest/pipelines.test && \
go clean -r -cache -testcache -modcache

# cli build
RUN cd ./cmd && CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -a -o ./target main.go

COPY ./models /models

# NON-PRIVILEGED USER
# create non-privileged testuser with id: 1000
RUN dnf install --disablerepo=* --enablerepo=amazonlinux --allowerasing -y dirmngr && dnf clean all
RUN useradd -u 1000 -m testuser && chown -R testuser:testuser /unittest

#--- test layer

FROM public.ecr.aws/amazonlinux/amazonlinux:2023 AS testing

RUN dnf install --disablerepo=* --enablerepo=amazonlinux --allowerasing -y dirmngr && dnf clean all

COPY --from=building /usr/lib64/onnxruntime.so /usr/lib64/onnxruntime.so
COPY --from=building /usr/lib/libtokenizers.a /usr/lib/libtokenizers.a
COPY --from=building /unittest /unittest
COPY --from=building /usr/local/bin/test2json /usr/local/bin/test2json
COPY --from=building /usr/local/bin/gotestsum /usr/local/bin/gotestsum
COPY --from=building /models /models
COPY --from=building /build/cmd/target /usr/local/bin/hugot

ENV GOVERSION=$GO_VERSION

# NON-PRIVILEGED USER
# create non-privileged testuser with id: 1000
RUN useradd -u 1000 -m testuser && chown -R testuser:testuser /unittest
RUN useradd -u 1000 -m testuser

# ENTRYPOINT
COPY ./scripts/entrypoint.sh /entrypoint.sh
178 changes: 164 additions & 14 deletions README.md
@@ -1,39 +1,62 @@
# Hugot: Huggingface 🤗 pipelines for golang

[![Go Reference](https://pkg.go.dev/badge/github.com/knights-analytics/hugot.svg)](https://pkg.go.dev/github.com/knights-analytics/hugot)
[![Go Report Card](https://goreportcard.com/badge/github.com/knights-analytics/hugot)](https://goreportcard.com/report/github.com/knights-analytics/hugot)
[![Coverage Status](https://coveralls.io/repos/github/knights-analytics/hugot/badge.svg?branch=main)](https://coveralls.io/github/knights-analytics/hugot?branch=main)

## What

-This library aims to provide an easy, scalable, and hassle-free way to run huggingface transformer pipelines in golang applications. It is built on the following principles:
+The goal of this library is to provide an easy, scalable, and hassle-free way to run huggingface transformer pipelines in golang applications. It is built on the following principles:

-1. Fidelity to the original Huggingface python implementations: we aim to accurately replicate huggingface inference implementations for the implemented pipelines, so that models trained and tested in python can be seamlessly deployed in golang
-2. Hassle-free and performant production use: we exclusively support onnx exports of huggingface models. Huggingface transformer models can be easily exported to onnx via huggingface optimum and used with the library (see instructions below)
-3. Run on your hardware: the aim is to be able to run onnx-exported huggingface transformer models on local hardware rather than relying on the http huggingface API
+1. Fidelity to the original Huggingface python implementations: the aim is to accurately replicate huggingface inference implementations for the implemented pipelines, so that models trained and tested in python can be seamlessly deployed in a golang application
+2. Hassle-free and performant production use: we exclusively support onnx exports of huggingface models. Pytorch transformer models that don't have an onnx version can be easily exported to onnx via [huggingface optimum](https://huggingface.co/docs/optimum/index), and used with the library
+3. Run on your hardware: this library is for those who want to run transformer models tightly coupled with their go applications, without the performance drawbacks of having to hit a REST API, or the hassle of setting up and maintaining e.g. a python RPC service that talks to go.

## Why

-While developing and fine-tuning transformer models with the huggingface python library is a great experience, if your production stack is golang-based being able to reliably deploy and scale the resulting pytorch models can be challenging. This library aims to make the process easy.
+Developing and fine-tuning transformer models with the huggingface python library is a great experience, but if your production stack is golang-based, reliably deploying and scaling the resulting pytorch models can be challenging and requires quite some setup. This library aims to let you simply lift-and-shift your python models and use the same huggingface pipelines you use for development for inference in a go application.

## For whom

-For the golang developer or ML engineer who wants to run transformer piplines at scale on their own hardware for their application
+For the golang developer or ML engineer who wants to run transformer pipelines on their own hardware, tightly coupled with their own application.

## What is already there

-We currently have implementations for the following three transfomer pipelines:
+Currently we have implementations for the following transformer pipelines:

- [featureExtraction](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.FeatureExtractionPipeline)
- [textClassification](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.TextClassificationPipeline) (single label classification only)
- [tokenClassification](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.TokenClassificationPipeline)

-Implementations for additional pipelines will follow. We also very gladly accept PRs to expand the set of pipelines! See [here](https://huggingface.co/docs/transformers/en/main_classes/pipelines) for the missing pipelines that can be implemented.
+Implementations for additional pipelines will follow. We also very gladly accept PRs to expand the set of pipelines! See [here](https://huggingface.co/docs/transformers/en/main_classes/pipelines) for the missing pipelines that can be implemented, and the contributing section below if you want to lend a hand.

Hugot can be used both as a library and as a command-line application. See below for usage instructions.

## Limitations

Apart from the fact that only the aforementioned pipelines are currently implemented, the current limitations are:
- the library and cli are only tested on amd64-linux
- only CPU inference is supported

Pipelines are also tested specifically on NLP use cases. In particular, we use the following models for testing:
- feature extraction: all-MiniLM-L6-v2
- text classification: distilbert-base-uncased-finetuned-sst-2-english
- token classification: distilbert-NER

If you encounter any further issues or want further features, please open an issue.

## Installation and usage

-Hugot has two main dependencies:
+Hugot can be used in two ways: as a library in your go application, or as a command-line binary.

### Use it as a library

-- the [tokenizer](https://github.com/Knights-Analytics/tokenizers) library with bindings to huggingface's rust tokenizer, which is itself a fork of https://github.com/daulet/tokenizers. In particular, you will need to make available to Hugot the compiled libtokenizers.a file, which resides by default at /usr/lib/libtokenizers.a.
-- the [onnxruntime_go](https://github.com/yalue/onnxruntime_go) library, with go bindings to onnxruntime. You will need to make available to Hugot the onnxruntime.so file, which resides by default at /usr/lib/onnxruntime.so
-
-Assuming you have rust installed, you can compile the tokenizers library and get the required libtokenizers.a as simply as follows:
+To use Hugot as a library in your application, you will need the following dependencies on your system:
+
+- the libtokenizers.a file obtained from building the [tokenizer](https://github.com/Knights-Analytics/tokenizers) go library (which is itself a fork of https://github.com/daulet/tokenizers). This file should be at /usr/lib/libtokenizers.a so that hugot can load it.
+- the onnxruntime.so file obtained from the onnxruntime project. This is dynamically loaded by hugot via the [onnxruntime_go](https://github.com/yalue/onnxruntime_go) inference library. This file should be at /usr/lib/onnxruntime.so or /usr/lib64/onnxruntime.so.
+
+You can get the libtokenizers.a in two ways. Assuming you have rust installed, you can compile the tokenizers library and get the required libtokenizers.a:

```
git clone https://github.com/Knights-Analytics/tokenizers -b main && \
@@ -52,10 +52,137 @@ curl -LO https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTI
mv ./onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}/lib/libonnxruntime.so.${ONNXRUNTIME_VERSION} /usr/lib/onnxruntime.so
```

See also the dev/test [dockerfile](./Dockerfile).
See also the [dockerfile](./Dockerfile) used for building & testing.

Once these pieces are in place, the library can be used as follows:

```go
import (
	"encoding/json"
	"fmt"

	"github.com/knights-analytics/hugot"
	"github.com/knights-analytics/hugot/pipelines"
)

// check(err) below stands in for your preferred error handling, e.g. panic(err)

// start a new session. This looks for the onnxruntime.so library in its default path, e.g. /usr/lib/onnxruntime.so
session, err := hugot.NewSession()
// if your onnxruntime.so is somewhere else, you can explicitly set it by using WithOnnxLibraryPath
// session, err := hugot.NewSession(WithOnnxLibraryPath("/path/to/onnxruntime.so"))
check(err)
// a successfully created hugot session needs to be destroyed when you're done
defer func(session *hugot.Session) {
	err := session.Destroy()
	check(err)
}(session)
// we now create a text classification pipeline. It requires the path to the onnx model folder,
// and a pipeline name
sentimentPipeline, err := session.NewTextClassificationPipeline(modelPath, "testPipeline")
check(err)
// we can now use the pipeline for prediction on a batch of strings
batch := []string{"This movie is disgustingly good !", "The director tried too much"}
batchResult, err := sentimentPipeline.Run(batch)
check(err)
// batchResult is an interface so that we can treat pipelines uniformly:
// cast it to the concrete result type of this pipeline
result, ok := batchResult.(*pipelines.TextClassificationOutput)
if !ok {
	panic("unexpected output type")
}
// and do whatever we want with it :)
s, err := json.Marshal(result)
check(err)
fmt.Println(string(s))
// {"ClassificationOutputs":[[{"Label":"POSITIVE","Score":0.9998536}],[{"Label":"NEGATIVE","Score":0.99752176}]]}
```

See also hugot_test.go for further examples.
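
The other pipelines follow the same pattern. For instance, token classification could look roughly as follows — a minimal sketch, in which the NewTokenClassificationPipeline constructor and the *pipelines.TokenClassificationOutput type are assumed by symmetry with the example above (see hugot_test.go for the exact API):

```go
// sketch only: the constructor and output type names below are assumed by
// symmetry with the text classification example; see hugot_test.go
nerPipeline, err := session.NewTokenClassificationPipeline(modelPath, "nerPipeline")
check(err)
nerResult, err := nerPipeline.Run([]string{"Paris is the capital of France"})
check(err)
// cast the generic pipeline output to this pipeline's concrete result type
entities, ok := nerResult.(*pipelines.TokenClassificationOutput)
if !ok {
	panic("unexpected output type")
}
out, err := json.Marshal(entities)
check(err)
fmt.Println(string(out))
```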

### Use it as a cli: Huggingface 🤗 pipelines from the command line

With hugot you don't need python, pytorch, or even go to run huggingface transformers. Simply install the hugot cli (alpha):

```
bash <(curl -s https://raw.githubusercontent.com/knights-analytics/hugot/main/scripts/install-hugot-cli.sh)
```

This will install the hugot binary at $HOME/.local/bin/hugot, and the corresponding onnxruntime.so library at $HOME/lib/hugot/onnxruntime.so.
If $HOME/.local/bin is on your $PATH, you can then do:

```
hugot run --model=/path/to/onnx/model --input=/path/to/input.jsonl --output=/path/to/folder/output --type=textClassification
```

Hugot will load the model, process the input, and write the results in the output folder.
Note that the hugot cli currently expects the input in a specific format: json lines with an "input" key containing the string to process.
Example:

```
{"input": "The director tried too much"}
{"input": "The film was excellent"}
```

This will produce a file called result_0.jsonl in the output folder with contents:

```
{"input":"The director tried too much","output":[{"Label":"NEGATIVE","Score":0.99752176}]}
{"input":"The film was excellent","output":[{"Label":"POSITIVE","Score":0.99986285}]}
```
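
If your raw input is a plain text file with one string per line, a small standalone helper along the following lines (illustrative only, not part of hugot) can wrap it into the jsonl format that the cli expects:

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"os"
)

// read raw strings from stdin, one per line, and print the jsonl format
// expected by the hugot cli: {"input": "<string>"}
func main() {
	scanner := bufio.NewScanner(os.Stdin)
	for scanner.Scan() {
		line, err := json.Marshal(map[string]string{"input": scanner.Text()})
		if err != nil {
			panic(err)
		}
		fmt.Println(string(line))
	}
	if err := scanner.Err(); err != nil {
		panic(err)
	}
}
```

Its output can then be piped directly into hugot run.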

Note that if --input is not provided, hugot will read from stdin, and if --output is not provided, it will write to stdout.
This allows you to chain things like:

```
echo '{"input":"The director tried too much","output":[{"Label":"NEGATIVE","Score":0.99752176}]}' | hugot run --model=/path/to/model --type=textClassification | jq
```

In this way you can run transformers fully from the command line.

## Contributing

### Development environment

The easiest way to contribute to hugot is by developing inside a docker container that has the tokenizer and onnxruntime libraries.
From the source folder, it should be as easy as:

```bash
make start-dev-container
```

which will download the test models, build the test container, and launch it (see [compose-dev](./compose-dev.yaml)), mounting the source code at /home/testuser/repositories/hugot. Then you can attach to the container with e.g. the vscode remote extension as testuser. The vscode attached-container configuration file can be set to:

```json
{
"remoteUser": "testuser",
"workspaceFolder": "/home/testuser/repositories/hugot",
"extensions": [
"bierner.markdown-preview-github-styles",
"golang.go",
"ms-azuretools.vscode-docker"
],
"remoteEnv": {"GOPATH": "/home/testuser/go"}
}
```

Once you're done, you can tear the container down with:

```bash
make stop-dev-container
```

Alternatively, you can use your IDE devcontainer support, and point it to the [Dockerfile](./Dockerfile).

If you prefer to develop on bare metal, you will need to download libtokenizers.a to /usr/lib/libtokenizers.a and onnxruntime.so to /usr/lib/onnxruntime.so.

### Run the tests

The full test suite can be run as follows, from the source folder:

```bash
make clean run-tests
```

This will build a test image and run all tests in a container. A testTarget folder will appear in the source directory with the test results.

### Contribution process

1. create or find an issue for your contribution
2. fork and develop
3. add tests and make sure the full test suite passes and test coverage does not dip below 80%
4. create a PR linking to the relevant issue

Thank you for contributing to hugot!
