Skip to content

Commit cfd934b

Browse files
authored
Optionally convert numbers in CSV files (#65)
* Bump datastation/runner for new `ContentTypeInfo.ConvertNumbers` flag
* Expose `convertNumbers` flag as command line argument
* Add documentation for the `--convert-numbers` flag
1 parent 6dfe57c commit cfd934b

File tree

4 files changed

+64
-14
lines changed

4 files changed

+64
-14
lines changed

README.md

+41
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,47 @@ dsq> SELECT * FROM {} WHERE NAME = 'Kevin';
481481
(0 rows)
482482
```
483483
484+
### Converting numbers in CSV and TSV files
485+
486+
CSV and TSV files do not let you specify the type of the individual
487+
values contained in them. All values are treated as strings by default.
488+
489+
This can lead to unexpected results in queries. Consider the following
490+
example:
491+
492+
```
493+
$ cat scores.csv
494+
name,score
495+
Fritz,90
496+
Rainer,95.2
497+
Fountainer,100
498+
499+
$ dsq scores.csv "SELECT * FROM {} ORDER BY score"
500+
[{"name":"Fountainer","score":"100"},
501+
{"name":"Fritz","score":"90"},
502+
{"name":"Rainer","score":"95.2"}]
503+
```
504+
505+
Note how the `score` column contains numerical values only. Still,
506+
sorting by that column yields unexpected results because the values are
507+
treated as strings, and sorted lexically. (You can tell that the
508+
individual scores were imported as strings because they're quoted in the
509+
JSON result.)
510+
511+
Use the `-n` or `--convert-numbers` flag to auto-detect and convert
512+
numerical values (integers and floats) in imported files:
513+
514+
```
515+
$ dsq scores.csv --convert-numbers "SELECT * FROM {} ORDER BY score"
516+
[{"name":"Fritz","score":90},
517+
{"name":"Rainer","score":95.2},
518+
{"name":"Fountainer","score":100}]
519+
```
520+
521+
Note how the scores are imported as numbers now and how the records in
522+
the result set are sorted by their numerical value. Also note that the
523+
individual scores are no longer quoted in the JSON result.
524+
484525
## Supported Data Types
485526
486527
| Name | File Extension(s) | Mime Type | Notes |

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ go 1.18
88
require (
99
github.com/chzyer/readline v1.5.0
1010
github.com/google/uuid v1.3.0
11-
github.com/multiprocessio/datastation/runner v0.0.0-20220601032709-9bda16b723bb
11+
github.com/multiprocessio/datastation/runner v0.0.0-20220609232347-405d8c1a88b2
1212
github.com/olekukonko/tablewriter v0.0.5
1313
)
1414

go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,8 @@ github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 h1:RWengNIwukTxcDr9
447447
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826/go.mod h1:TaXosZuwdSHYgviHp1DAtfrULt5eUgsSMsZf+YrPgl8=
448448
github.com/multiprocessio/datastation/runner v0.0.0-20220601032709-9bda16b723bb h1:sG23Q6XOfcOtK9bM4QhcmGiqsFVzoXwkZRvb8OJ3EiU=
449449
github.com/multiprocessio/datastation/runner v0.0.0-20220601032709-9bda16b723bb/go.mod h1:UCms/xK08DspNqDDZ5XsaIlc39AuREmeELspFYghMGI=
450+
github.com/multiprocessio/datastation/runner v0.0.0-20220609232347-405d8c1a88b2 h1:WWCPwJPWfBVUhuAfFZJGs6vxemeeqW8ahDRtTtbGyxw=
451+
github.com/multiprocessio/datastation/runner v0.0.0-20220609232347-405d8c1a88b2/go.mod h1:UCms/xK08DspNqDDZ5XsaIlc39AuREmeELspFYghMGI=
450452
github.com/multiprocessio/go-json v0.0.0-20220308002443-61d497dd7b9e h1:NlPl7amllnQyVAkZgjBvFEkKxJSba/R8ZpaTodc7SIQ=
451453
github.com/multiprocessio/go-json v0.0.0-20220308002443-61d497dd7b9e/go.mod h1:huI4M/MrI5px/SgmXYi0a2byKikSLgDrnMQuXOqKtw4=
452454
github.com/multiprocessio/go-openoffice v0.0.0-20220110232726-064f5dda1956 h1:WVofL03Eq+z3LbDOfH5eKzu2U85LFZZngOMBlNaO/H0=

main.go

+20-13
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ func resolveContentType(fileExtensionOrContentType string) runner.MimeType {
4040
return runner.GetMimeType("x."+fileExtensionOrContentType, runner.ContentTypeInfo{})
4141
}
4242

43-
func evalFileInto(file, mimetype string, out *os.File) error {
43+
func evalFileInto(file, mimetype string, convertNumbers bool, out *os.File) error {
4444
if mimetype == "" {
4545
mimetype = string(runner.GetMimeType(file, runner.ContentTypeInfo{}))
4646
} else {
@@ -55,7 +55,8 @@ func evalFileInto(file, mimetype string, out *os.File) error {
5555
defer w.Flush()
5656

5757
return runner.TransformFile(file, runner.ContentTypeInfo{
58-
Type: mimetype,
58+
Type: mimetype,
59+
ConvertNumbers: convertNumbers,
5960
}, w)
6061
}
6162

@@ -213,7 +214,7 @@ func getFilesContentHash(files []string) (string, error) {
213214
return hex.EncodeToString(sha1.Sum(nil)), nil
214215
}
215216

216-
func importFile(projectId string, file, mimetype string, ec runner.EvalContext) (*runner.PanelInfo, error) {
217+
func importFile(projectId string, file, mimetype string, convertNumbers bool, ec runner.EvalContext) (*runner.PanelInfo, error) {
217218
panelId := uuid.New().String()
218219
resultFile := ec.GetPanelResultsFile(projectId, panelId)
219220
out, err := openTruncate(resultFile)
@@ -222,7 +223,7 @@ func importFile(projectId string, file, mimetype string, ec runner.EvalContext)
222223
}
223224
defer out.Close()
224225

225-
if err := evalFileInto(file, mimetype, out); err != nil {
226+
if err := evalFileInto(file, mimetype, convertNumbers, out); err != nil {
226227
return nil, err
227228
}
228229

@@ -333,14 +334,15 @@ func repl(project *runner.ProjectState, ec *runner.EvalContext, args *args, file
333334
}
334335

335336
type args struct {
336-
pipedMimetype string
337-
pretty bool
338-
schema bool
339-
sqlFile string
340-
cacheSettings runner.CacheSettings
341-
nonFlagArgs []string
342-
dumpCacheFile bool
343-
isInteractive bool
337+
pipedMimetype string
338+
pretty bool
339+
schema bool
340+
sqlFile string
341+
cacheSettings runner.CacheSettings
342+
nonFlagArgs []string
343+
dumpCacheFile bool
344+
isInteractive bool
345+
convertNumbers bool
344346
}
345347

346348
func getArgs() (*args, error) {
@@ -415,6 +417,11 @@ func getArgs() (*args, error) {
415417
continue
416418
}
417419

420+
if arg == "-n" || arg == "--convert-numbers" {
421+
args.convertNumbers = true
422+
continue
423+
}
424+
418425
args.nonFlagArgs = append(args.nonFlagArgs, arg)
419426
}
420427

@@ -558,7 +565,7 @@ func _main() error {
558565
// When dumping schema, need to ingest even if cache is on.
559566
if !args.cacheSettings.CachePresent || !args.cacheSettings.Enabled || lastNonFlagArg == "" {
560567
for _, file := range files {
561-
panel, err := importFile(project.Id, file, mimetypeOverride[file], ec)
568+
panel, err := importFile(project.Id, file, mimetypeOverride[file], args.convertNumbers, ec)
562569
if err != nil {
563570
return err
564571
}

0 commit comments

Comments
 (0)