Skip to content

Commit 4b37984

Browse files
authored
feat: add dataset tools and libraries (#1)
Signed-off-by: Grant Linville <[email protected]>
1 parent bd3822d commit 4b37984

22 files changed

+785
-0
lines changed

.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
test.gpt
2+
bin/
3+
.idea/
4+
.vscode/

Makefile

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
.PHONY: build
2+
build:
3+
CGO_ENABLED=0 go build -o bin/gptscript-go-tool -tags "${GO_TAGS}" -ldflags "-s -w" .

go.mod

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
module github.com/gptscript-ai/datasets
2+
3+
go 1.23.2
4+
5+
require github.com/stretchr/testify v1.9.0
6+
7+
require (
8+
github.com/davecgh/go-spew v1.1.1 // indirect
9+
github.com/pmezard/go-difflib v1.0.0 // indirect
10+
gopkg.in/yaml.v3 v3.0.1 // indirect
11+
)

go.sum

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
2+
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
3+
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
4+
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
5+
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
6+
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
7+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
8+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
9+
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
10+
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

main.go

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
package main
2+
3+
import (
4+
"fmt"
5+
"os"
6+
7+
"github.com/gptscript-ai/datasets/pkg/tools"
8+
)
9+
10+
func main() {
11+
if len(os.Args) < 2 {
12+
fmt.Println(`usage: gptscript-go-tool <command>
13+
subcommands: listDatasets, listElements, getElement, createDataset, addElement
14+
env vars: GPTSCRIPT_WORKSPACE_DIR`)
15+
}
16+
17+
workspace := os.Getenv("GPTSCRIPT_WORKSPACE_DIR")
18+
if workspace == "" {
19+
fmt.Println("missing GPTSCRIPT_WORKSPACE_DIR")
20+
os.Exit(1)
21+
}
22+
23+
switch os.Args[1] {
24+
case "listDatasets":
25+
tools.ListDatasets(workspace)
26+
case "listElements":
27+
tools.ListElements(workspace, os.Getenv("DATASETID"))
28+
case "getElement":
29+
tools.GetElement(workspace, os.Getenv("DATASETID"), os.Getenv("ELEMENT"))
30+
case "createDataset":
31+
tools.CreateDataset(workspace, os.Getenv("DATASETNAME"), os.Getenv("DATASETDESCRIPTION"))
32+
case "addElement":
33+
tools.AddElement(workspace, os.Getenv("DATASETID"), os.Getenv("ELEMENTNAME"), os.Getenv("ELEMENTDESCRIPTION"), []byte(os.Getenv("ELEMENTCONTENT")))
34+
default:
35+
fmt.Printf("unknown command: %s\n", os.Args[1])
36+
os.Exit(1)
37+
}
38+
}

pkg/dataset/dataset.go

+110
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
package dataset
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
"os"
7+
"path/filepath"
8+
9+
"github.com/gptscript-ai/datasets/pkg/util"
10+
)
11+
12+
// ElementMeta is the descriptive metadata of a single dataset element.
type ElementMeta struct {
	Name        string `json:"name"`
	Description string `json:"description"`
}

// Element is an entry in a dataset: its metadata plus the name of the file
// that holds its contents.
//
// ElementMeta is embedded without a JSON name, so encoding/json flattens its
// fields into the enclosing object. No tag is needed for that; "inline" is
// not an option encoding/json recognises.
type Element struct {
	ElementMeta
	File string `json:"file"`
}

// DatasetMeta is the descriptive metadata of a dataset.
type DatasetMeta struct {
	ID          string `json:"id"`
	Name        string `json:"name"`
	Description string `json:"description"`
}

// Dataset is a collection of elements together with the directory their
// files are read from and written to.
//
// DatasetMeta is embedded and therefore flattened into the JSON object.
// BaseDir is omitted from the JSON encoding when it is empty.
type Dataset struct {
	DatasetMeta
	BaseDir  string             `json:"baseDir,omitempty"`
	Elements map[string]Element `json:"elements"`
}
33+
34+
func (d *Dataset) GetID() string {
35+
return d.ID
36+
}
37+
38+
func (d *Dataset) GetName() string {
39+
return d.Name
40+
}
41+
42+
func (d *Dataset) GetDescription() string {
43+
return d.Description
44+
}
45+
46+
func (d *Dataset) GetLength() int {
47+
return len(d.Elements)
48+
}
49+
50+
func (d *Dataset) ListElements() []ElementMeta {
51+
var elements []ElementMeta
52+
for _, element := range d.Elements {
53+
elements = append(elements, element.ElementMeta)
54+
}
55+
return elements
56+
}
57+
58+
func (d *Dataset) GetElement(name string) ([]byte, Element, error) {
59+
e, exists := d.Elements[name]
60+
if !exists {
61+
return nil, Element{}, fmt.Errorf("element %s not found", name)
62+
}
63+
64+
contents, err := os.ReadFile(d.BaseDir + string(os.PathSeparator) + e.File)
65+
if err != nil {
66+
return nil, Element{}, fmt.Errorf("failed to read element %s: %w", name, err)
67+
}
68+
69+
return contents, e, nil
70+
}
71+
72+
func (d *Dataset) AddElement(name, description string, contents []byte) (Element, error) {
73+
if _, exists := d.Elements[name]; exists {
74+
return Element{}, fmt.Errorf("element %s already exists", name)
75+
}
76+
77+
fileName, err := util.EnsureUniqueFilename(d.BaseDir, util.ToFileName(name))
78+
if err != nil {
79+
return Element{}, fmt.Errorf("failed to generate unique file name: %w", err)
80+
}
81+
82+
loc := filepath.Join(d.BaseDir, fileName)
83+
if err := os.WriteFile(loc, contents, 0644); err != nil {
84+
return Element{}, fmt.Errorf("failed to write element %s: %w", name, err)
85+
}
86+
87+
e := Element{
88+
ElementMeta: ElementMeta{
89+
Name: name,
90+
Description: description,
91+
},
92+
File: fileName,
93+
}
94+
95+
d.Elements[name] = e
96+
return e, d.save()
97+
}
98+
99+
func (d *Dataset) save() error {
100+
datasetJSON, err := json.Marshal(d)
101+
if err != nil {
102+
return fmt.Errorf("failed to marshal dataset: %w", err)
103+
}
104+
105+
if err := os.WriteFile(d.BaseDir+extension, datasetJSON, 0644); err != nil {
106+
return fmt.Errorf("failed to write dataset file: %w", err)
107+
}
108+
109+
return nil
110+
}

pkg/dataset/dataset_test.go

+110
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
package dataset
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
"testing"
7+
8+
"github.com/stretchr/testify/require"
9+
)
10+
11+
const testWorkspace = "testworkspace"
12+
13+
func TestDatasetsRead(t *testing.T) {
14+
wd, err := os.Getwd()
15+
require.NoError(t, err)
16+
17+
workspaceDir := filepath.Join(wd, testWorkspace)
18+
m, err := NewManager(workspaceDir)
19+
require.NoError(t, err)
20+
21+
datasetMetas, err := m.ListDatasets()
22+
require.NoError(t, err)
23+
require.Len(t, datasetMetas, 2)
24+
25+
datasetOne, err := m.GetDataset("one")
26+
require.NoError(t, err)
27+
require.Equal(t, "one", datasetOne.GetName())
28+
require.Equal(t, "The first test dataset", datasetOne.GetDescription())
29+
require.Equal(t, 2, datasetOne.GetLength())
30+
31+
oneMetas := datasetOne.ListElements()
32+
require.Len(t, oneMetas, 2)
33+
34+
oneOneBytes, _, err := datasetOne.GetElement("file1")
35+
require.NoError(t, err)
36+
require.Equal(t, "This is dataset 1, file 1.\n", string(oneOneBytes))
37+
38+
oneTwoBytes, _, err := datasetOne.GetElement("file2")
39+
require.NoError(t, err)
40+
require.Equal(t, "This is dataset 1, file 2.\n", string(oneTwoBytes))
41+
42+
datasetTwo, err := m.GetDataset("two")
43+
require.NoError(t, err)
44+
require.Equal(t, "two", datasetTwo.GetName())
45+
require.Equal(t, "The second test dataset", datasetTwo.GetDescription())
46+
require.Equal(t, 2, datasetTwo.GetLength())
47+
48+
twoMetas := datasetTwo.ListElements()
49+
require.Len(t, twoMetas, 2)
50+
51+
twoOneBytes, _, err := datasetTwo.GetElement("file1")
52+
require.NoError(t, err)
53+
require.Equal(t, "This is dataset 2, file 1.\n", string(twoOneBytes))
54+
55+
twoTwoBytes, _, err := datasetTwo.GetElement("file2")
56+
require.NoError(t, err)
57+
require.Equal(t, "This is dataset 2, file 2.\n", string(twoTwoBytes))
58+
}
59+
60+
func TestDatasetWrite(t *testing.T) {
61+
wd, err := os.Getwd()
62+
require.NoError(t, err)
63+
64+
workspaceDir := filepath.Join(wd, testWorkspace)
65+
m, err := NewManager(workspaceDir)
66+
require.NoError(t, err)
67+
68+
t.Cleanup(func() {
69+
threeFiles, _ := filepath.Glob(filepath.Join(workspaceDir, "datasets", "three", "*"))
70+
71+
for _, file := range threeFiles {
72+
_ = os.Remove(file)
73+
}
74+
75+
_ = os.Remove(filepath.Join(workspaceDir, "datasets", "three"))
76+
_ = os.Remove(filepath.Join(workspaceDir, "datasets", "three.dataset.json"))
77+
})
78+
79+
datasetThree, err := m.NewDataset("three", "The third test dataset")
80+
require.NoError(t, err)
81+
require.Equal(t, "three", datasetThree.GetName())
82+
require.Equal(t, "The third test dataset", datasetThree.GetDescription())
83+
require.Equal(t, 0, datasetThree.GetLength())
84+
85+
// Let's add a couple elements.
86+
_, err = datasetThree.AddElement("file1", "The first file", []byte("This is dataset 3, file 1.\n"))
87+
require.NoError(t, err)
88+
require.Equal(t, 1, datasetThree.GetLength())
89+
90+
_, err = datasetThree.AddElement("file2", "The second file", []byte("This is dataset 3, file 2.\n"))
91+
require.NoError(t, err)
92+
require.Equal(t, 2, datasetThree.GetLength())
93+
94+
// Let's read it back.
95+
datasetThree, err = m.GetDataset(datasetThree.GetID())
96+
require.NoError(t, err)
97+
require.Equal(t, "three", datasetThree.GetName())
98+
require.Equal(t, "The third test dataset", datasetThree.GetDescription())
99+
100+
threeMetas := datasetThree.ListElements()
101+
require.Len(t, threeMetas, 2)
102+
103+
threeOneBytes, _, err := datasetThree.GetElement("file1")
104+
require.NoError(t, err)
105+
require.Equal(t, "This is dataset 3, file 1.\n", string(threeOneBytes))
106+
107+
threeTwoBytes, _, err := datasetThree.GetElement("file2")
108+
require.NoError(t, err)
109+
require.Equal(t, "This is dataset 3, file 2.\n", string(threeTwoBytes))
110+
}

0 commit comments

Comments
 (0)