1
- import { reverse } from "d3-array" ;
1
+ import { greatest , reverse } from "d3-array" ;
2
2
import { FileAttachment } from "./fileAttachment.js" ;
3
3
import { isArqueroTable } from "./arquero.js" ;
4
4
import { isArrowTable , loadArrow } from "./arrow.js" ;
@@ -66,13 +66,20 @@ function objectHasEnumerableKeys(value) {
66
66
}
67
67
68
68
// Returns true if the given value is an array in which every element is a
// column schema (as determined by isColumnSchema).
function isQueryResultSetSchema(schemas) {
  if (!Array.isArray(schemas)) return false;
  return schemas.every(isColumnSchema);
}
71
74
72
75
// Returns true if the given value is an array whose elements are all strings
// (i.e., a list of column names).
function isQueryResultSetColumns(columns) {
  if (!Array.isArray(columns)) return false;
  const isString = (name) => typeof name === "string";
  return columns.every(isString);
}
75
78
79
// Returns true if the given value looks like a column schema: a truthy value
// with a string `name` and a string `type`. Always returns a strict boolean
// (never the falsy input itself), so the result is safe to store or compare.
function isColumnSchema(schema) {
  return Boolean(schema) && typeof schema.name === "string" && typeof schema.type === "string";
}
82
+
76
83
// Returns true if the value represents an array of primitives (i.e., a
77
84
// single-column table). This should only be passed values for which
78
85
// isDataArray returns true.
@@ -191,15 +198,17 @@ function sourceCache(loadSource) {
191
198
// Loads the given table data source, memoized through sourceCache. File
// attachments are dispatched on their MIME type (with .arrow/.parquet files
// handed to loadDuckDBClient); Arrow and Arquero tables are also handed to
// loadDuckDBClient; arrays of primitives are wrapped into {value} rows; any
// other source is returned as-is.
const loadTableDataSource = sourceCache(async (source, name) => {
  if (!(source instanceof FileAttachment)) {
    if (isArrowTable(source) || isArqueroTable(source)) return loadDuckDBClient(source, name);
    const isPrimitiveArray = isDataArray(source) && arrayIsPrimitive(source);
    return isPrimitiveArray ? Array.from(source, (value) => ({value})) : source;
  }
  const mimeType = source.mimeType;
  if (mimeType === "text/csv") return source.csv();
  if (mimeType === "text/tab-separated-values") return source.tsv();
  if (mimeType === "application/json") return source.json();
  if (mimeType === "application/x-sqlite3") return source.sqlite();
  // No recognized MIME type: fall back to the file extension.
  if (/\.(arrow|parquet)$/i.test(source.name)) return loadDuckDBClient(source, name);
  throw new Error(`unsupported file type: ${source.mimeType}`);
});
205
214
@@ -542,15 +551,84 @@ export function getTypeValidator(colType) {
542
551
}
543
552
}
544
553
554
// Accepts dates in the form of ISOString and LocaleDateString, with or without time
const DATE_TEST = /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2}))|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/;

// Coerces a single value to the given column type.
//
// - "string": null/undefined and strings pass through; anything else is
//   converted with String.
// - "boolean": strings are true/false (case-insensitive, trimmed) or null;
//   null/undefined and booleans pass through; anything else uses Boolean.
// - "bigint": null/undefined and bigints pass through; values whose numeric
//   form is an integer become a BigInt; anything else becomes undefined.
// - "integer"/"number": numbers pass through; null/undefined and blank strings
//   become NaN; anything else uses Number.
// - "date": null/undefined and Dates pass through; numbers are treated as
//   epoch values; blank strings become null; other values are parsed only if
//   they match DATE_TEST, and otherwise yield an invalid Date.
// - "array"/"object"/"buffer"/"other": returned unchanged.
//
// Throws an Error for any other type.
export function coerceToType(value, type) {
  switch (type) {
    case "string":
      return typeof value === "string" || value == null ? value : String(value);
    case "boolean":
      if (typeof value === "string") {
        const trimValue = value.trim().toLowerCase();
        return trimValue === "true"
          ? true
          : trimValue === "false"
          ? false
          : null;
      }
      return typeof value === "boolean" || value == null
        ? value
        : Boolean(value);
    case "bigint": {
      if (typeof value === "bigint" || value == null) return value;
      const number = typeof value === "string" && !value.trim() ? NaN : +value;
      if (!Number.isInteger(number)) return undefined;
      try {
        // Prefer converting the original value so that large integer strings
        // keep full precision.
        return BigInt(value); // eslint-disable-line no-undef
      } catch (error) {
        // Strings such as "1.0" or "1e3" are integers numerically but are not
        // valid BigInt literals; fall back to the already-validated number.
        if (!(error instanceof SyntaxError)) throw error;
        return BigInt(number); // eslint-disable-line no-undef
      }
    }
    case "integer": // not a target type for coercion, but can be inferred
    case "number": {
      return typeof value === "number"
        ? value
        : value == null || (typeof value === "string" && !value.trim())
        ? NaN
        : Number(value);
    }
    case "date": {
      if (value instanceof Date || value == null) return value;
      if (typeof value === "number") return new Date(value);
      const trimValue = String(value).trim();
      if (typeof value === "string" && !trimValue) return null;
      // Only hand recognized formats to the Date parser; everything else
      // becomes an invalid Date.
      return new Date(DATE_TEST.test(trimValue) ? trimValue : NaN);
    }
    case "array":
    case "object":
    case "buffer":
    case "other":
      return value;
    default:
      throw new Error(`Unable to coerce to type: ${type}`);
  }
}
603
+
545
604
// This function applies table cell operations to an in-memory table (array of
546
605
// objects); it should be equivalent to the corresponding SQL query. TODO Use
547
606
// DuckDBClient for data arrays, too, and then we wouldn’t need our own __table
548
607
// function to do table operations on in-memory data?
549
608
export function __table ( source , operations ) {
550
609
const input = source ;
551
610
let { schema, columns} = source ;
552
- let primitive = arrayIsPrimitive ( source ) ;
553
- if ( primitive ) source = Array . from ( source , ( value ) => ( { value} ) ) ;
611
+ let inferredSchema = false ;
612
+ if ( ! isQueryResultSetSchema ( schema ) ) {
613
+ schema = inferSchema ( source , columns ) ;
614
+ inferredSchema = true ;
615
+ }
616
+ // Combine column types from schema with user-selected types in operations
617
+ const types = new Map ( schema . map ( ( { name, type} ) => [ name , type ] ) ) ;
618
+ if ( operations . type ) {
619
+ for ( const { name, type} of operations . type ) {
620
+ types . set ( name , type ) ;
621
+ // update schema with user-selected type
622
+ if ( schema === input . schema ) schema = schema . slice ( ) ; // copy on write
623
+ const colIndex = schema . findIndex ( ( col ) => col . name === name ) ;
624
+ if ( colIndex > - 1 ) schema [ colIndex ] = { ...schema [ colIndex ] , type} ;
625
+ }
626
+ source = source . map ( d => coerceRow ( d , types , schema ) ) ;
627
+ } else if ( inferredSchema ) {
628
+ // Coerce data according to new schema, unless that happened due to
629
+ // operations.type, above.
630
+ source = source . map ( d => coerceRow ( d , types , schema ) ) ;
631
+ }
554
632
for ( const { type, operands} of operations . filter ) {
555
633
const [ { value : column } ] = operands ;
556
634
const values = operands . slice ( 1 ) . map ( ( { value} ) => value ) ;
@@ -663,7 +741,7 @@ export function __table(source, operations) {
663
741
Object . fromEntries ( operations . select . columns . map ( ( c ) => [ c , d [ c ] ] ) )
664
742
) ;
665
743
}
666
- if ( ! primitive && operations . names ) {
744
+ if ( operations . names ) {
667
745
const overridesByName = new Map ( operations . names . map ( ( n ) => [ n . column , n ] ) ) ;
668
746
if ( schema ) {
669
747
schema = schema . map ( ( s ) => {
@@ -684,10 +762,120 @@ export function __table(source, operations) {
684
762
} ) )
685
763
) ;
686
764
}
687
- if ( primitive ) source = source . map ( ( d ) => d . value ) ;
688
765
if ( source !== input ) {
689
766
if ( schema ) source . schema = schema ;
690
767
if ( columns ) source . columns = columns ;
691
768
}
692
769
return source ;
693
770
}
771
+
772
// Builds a new row object containing one property per schema column, with
// each value coerced to that column's type from the given types map. Columns
// whose type is "raw" are copied through without coercion.
function coerceRow(object, types, schema) {
  const row = {};
  for (const col of schema) {
    const targetType = types.get(col.name);
    const original = object[col.name];
    if (targetType === "raw") {
      row[col.name] = original;
    } else {
      row[col.name] = coerceToType(original, targetType);
    }
  }
  return row;
}
781
+
782
// Returns a fresh tally object with a zeroed counter for each countable type,
// plus a "defined" counter.
function createTypeCount() {
  const counters = [
    "boolean",
    "integer",
    "number",
    "date",
    "string",
    "array",
    "object",
    "bigint",
    "buffer",
    "defined"
  ];
  const tally = {};
  for (const counter of counters) tally[counter] = 0;
  return tally;
}
796
+
797
+ // Caution: the order below matters! 🌶️ The first one that passes the ≥90% test
798
+ // should be the one that we choose, and therefore these types should be listed
799
+ // from most specific to least specific.
800
+ const types = [
801
+ "boolean" ,
802
+ "integer" ,
803
+ "number" ,
804
+ "date" ,
805
+ "bigint" ,
806
+ "array" ,
807
+ "object" ,
808
+ "buffer"
809
+ // Note: "other" and "string" are intentionally omitted; see below!
810
+ ] ;
811
+
812
// We need to show *all* keys present in the array of Objects. Returns the
// unique own, enumerable property names found across all rows, in the order
// they first appear. Falsy rows (e.g., null or undefined) are skipped.
function getAllKeys(rows) {
  const keys = new Set();
  for (const row of rows) {
    // avoid crash if row is null or undefined
    if (!row) continue;
    // Object.keys yields exactly the own, enumerable string keys
    for (const key of Object.keys(row)) keys.add(key);
  }
  return Array.from(keys);
}
830
+
831
// Infers a schema (an array of {name, type, inferred} objects, one per column)
// by sampling up to the first 100 rows of the given source. If columns is not
// given, it defaults to every key found by getAllKeys. Null, undefined and
// blank-string values are ignored when counting; string values are also
// tallied as boolean, number/integer or date when they look like one.
export function inferSchema(source, columns = getAllKeys(source)) {
  const schema = [];
  const sampleSize = 100;
  const sample = source.slice(0, sampleSize);
  for (const col of columns) {
    const colCount = createTypeCount();
    for (const d of sample) {
      // getAllKeys tolerates null or undefined rows, so do the same here
      // rather than crashing on the property access below.
      if (d == null) continue;
      let value = d[col];
      if (value == null) continue;
      const type = typeof value;
      if (type !== "string") {
        ++colCount.defined;
        if (Array.isArray(value)) ++colCount.array;
        else if (value instanceof Date) ++colCount.date;
        else if (value instanceof ArrayBuffer) ++colCount.buffer;
        else if (type === "number") {
          ++colCount.number;
          if (Number.isInteger(value)) ++colCount.integer;
        }
        // bigint, boolean, or object
        else if (type in colCount) ++colCount[type];
      } else {
        value = value.trim();
        if (!value) continue;
        ++colCount.defined;
        ++colCount.string;
        if (/^(true|false)$/i.test(value)) {
          ++colCount.boolean;
        } else {
          const number = Number(value);
          if (!Number.isNaN(number)) {
            ++colCount.number;
            if (Number.isInteger(number)) ++colCount.integer;
          } else if (DATE_TEST.test(value)) ++colCount.date;
        }
      }
    }
    // Choose the non-string, non-other type with the greatest count that is
    // also ≥90%; or if no such type meets that criterion, fallback to string
    // if ≥90%; and lastly fallback to other.
    const minCount = Math.max(1, colCount.defined * 0.9);
    const type =
      greatest(types, (type) =>
        colCount[type] >= minCount ? colCount[type] : NaN
      ) ?? (colCount.string >= minCount ? "string" : "other");
    schema.push({
      name: col,
      type: type,
      inferred: type
    });
  }
  return schema;
}
0 commit comments