Skip to content

Commit

Permalink
Release 7.048
Browse files (browse the repository at this point in the history)
  • Loading branch information
cnuernber committed Mar 4, 2025
1 parent f4ee887 commit 69d62ee
Show file tree
Hide file tree
Showing 36 changed files with 80 additions and 77 deletions.
4 changes: 2 additions & 2 deletions deps.edn
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
{:paths ["src" "resources" "target/classes"]
:deps {;;org.clojure/clojure {:mvn/version "1.11.1"}
cnuernber/dtype-next {:mvn/version "10.134"}
cnuernber/dtype-next {:mvn/version "10.135" }
techascent/tech.io {:mvn/version "4.31"
:exclusions [org.apache.commons/commons-compress]}
org.apache.datasketches/datasketches-java {:mvn/version "4.2.0"}
Expand All @@ -14,7 +14,7 @@
:exec-fn codox.main/-main
:exec-args {:group-id "techascent"
:artifact-id "tech.ml.dataset"
:version "7.047"
:version "7.048"
:name "TMD"
:description "A Clojure high performance data processing system"
:metadata {:doc/format :markdown}
Expand Down
2 changes: 1 addition & 1 deletion docs/000-getting-started.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/100-walkthrough.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/200-quick-reference.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/columns-readers-and-datatypes.html

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions docs/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/nippy-serialization-rocks.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/supported-datatypes.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.categorical.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.clipboard.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.column-filters.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.column.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.io.csv.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.io.datetime.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.io.string-row-parser.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.io.univocity.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.join.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.math.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.metamorph.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.modelling.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.print.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.reductions.apache-data-sketch.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.reductions.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.rolling.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.set.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.tensor.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.dataset.zip.html

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions docs/tech.v3.libs.arrow.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.libs.clj-transit.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.libs.fastexcel.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.libs.guava.cache.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.libs.parquet.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.libs.poi.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tech.v3.libs.tribuo.html

Large diffs are not rendered by default.

71 changes: 37 additions & 34 deletions src/tech/v3/libs/arrow.clj
Original file line number | Diff line number | Diff line change
Expand Up @@ -103,10 +103,13 @@
[clojure.core.protocols :as clj-proto]
[clojure.datafy :refer [datafy]]
[charred.api :as json]
[ham-fisted.api :as hamf]
[ham-fisted.reduce :as hamf-rf]
[ham-fisted.lazy-noncaching :as lznc]
[ham-fisted.set :as set]
[ham-fisted.protocols :as hamf-proto])
(:import [org.apache.arrow.vector.ipc.message MessageSerializer]
(:import [ham_fisted ArrayLists]
[org.apache.arrow.vector.ipc.message MessageSerializer]
[org.apache.arrow.flatbuf Message DictionaryBatch RecordBatch
FieldNode Buffer BodyCompression BodyCompressionMethod Footer Block]
[org.roaringbitmap RoaringBitmap]
Expand All @@ -118,7 +121,7 @@
ArrowType$LargeUtf8 ArrowType$Null ArrowType$List ArrowType$Binary]
[org.apache.arrow.flatbuf CompressionType]
[org.apache.arrow.vector.types MetadataVersion]
[org.apache.arrow.vector.ipc WriteChannel]
[org.apache.arrow.vector.ipc WriteChannel]
[tech.v3.dataset.string_table StringTable]
[tech.v3.dataset.impl.column Column]
[tech.v3.dataset Text]
Expand Down Expand Up @@ -988,7 +991,7 @@ Dependent block frames are not supported!!")


(defn- compress-record-batch-buffers
[buffers options]
[options buffers]
(if-let [comp-map (get options :compression)]
{:compression-type (compression-kwd->file-type (comp-map :compression-type))
;;parallelize buffer compression
Expand All @@ -1001,7 +1004,7 @@ Dependent block frames are not supported!!")
data-len (byte-array 8)
^ByteBuffer nio-buf (nio-buffer/as-nio-buffer data-len)
data-len-buf (dtype/->buffer data-len)]
(first
(nth
(reduce (fn [[res writer-cache] buffer]
(let [buffer (serialize-to-bytes buffer)
uncomp-len (dtype/ecount buffer)
Expand All @@ -1017,10 +1020,11 @@ Dependent block frames are not supported!!")
[(conj res (nio-buffer/as-nio-buffer dst-buffer))
writer-cache]))
[[] nil]
buffers)))))}
{:buffers (vec (pmap #(-> (serialize-to-bytes %)
(nio-buffer/as-nio-buffer))
buffers))}))
buffers)
0))))}
{:buffers (vec (hamf/pmap #(-> (serialize-to-bytes %)
(nio-buffer/as-nio-buffer))
buffers))}))

(defn- buffers->buf-entries
[buffers]
Expand Down Expand Up @@ -1055,7 +1059,7 @@ Dependent block frames are not supported!!")
(let [n-elems (dec (count offsets))
missing (no-missing n-elems)
{:keys [compression-type buffers]}
(compress-record-batch-buffers [missing offsets byte-data] options)
(->> [missing offsets byte-data] (compress-record-batch-buffers options))
buffer-entries (buffers->buf-entries buffers)
enc-id (.getId ^DictionaryEncoding encoding)
offset-len (pad (byte-length offsets))
Expand Down Expand Up @@ -1116,13 +1120,13 @@ Dependent block frames are not supported!!")
retval))


(defn- col->buffers
(defn col->buffers
[col options]
(let [col-dt (casting/un-alias-datatype (dtype/elemwise-datatype col))
col-dt (if (and (:strings-as-text? options) (= col-dt :string))
:text
col-dt)
cbuf (dtype/->buffer col)]
cbuf (tech.v3.datatype.protocols/->buffer col)]
;;Get the data as a datatype safe for conversion into a
;;nio buffer.
(if (casting/numeric-type? col-dt)
Expand Down Expand Up @@ -1150,11 +1154,10 @@ Dependent block frames are not supported!!")
:text
(let [byte-data (dtype/make-list :int8)
offsets (dtype/make-list :int32)]
(pfor/doiter
strdata cbuf
(let [strdata (str (or strdata ""))]
(.add offsets (.size byte-data))
(.addAll byte-data (dtype/->buffer (.getBytes strdata)))))
(reduce (fn [_ strdata]
(let [strdata (str (or strdata ""))]
(.add offsets (.size byte-data))
(.addAllReducible byte-data (ArrayLists/toList (.getBytes strdata))))) nil cbuf)
(.add offsets (.size byte-data))
[(nio-buffer/as-nio-buffer offsets)
(nio-buffer/as-nio-buffer byte-data)])))))
Expand All @@ -1171,25 +1174,25 @@ Dependent block frames are not supported!!")
(dtype/->byte-array))
nodes-buffs-lens
(->> (ds-base/columns dataset)
(map (fn [col]
(let [col-missing (ds-proto/missing col)
n-missing (dtype/ecount col-missing)
valid-buf (if (== 0 n-missing)
all-valid-buf
(missing-bytes col-missing all-valid-buf))
buffers (vec (concat [valid-buf]
(col->buffers col options)))
lengths (map (comp pad byte-length) buffers)
col-len (long (apply + lengths))]
{:node {:n-elems n-rows
:n-null-entries n-missing}
:buffers buffers
:length col-len}))))
nodes (map :node nodes-buffs-lens)
(hamf/pmap (fn [col]
(let [col-missing (ds-proto/missing col)
n-missing (dtype/ecount col-missing)
valid-buf (if (== 0 n-missing)
all-valid-buf
(missing-bytes col-missing all-valid-buf))
buffers (hamf/concatv [valid-buf] (col->buffers col options))
lengths (hamf/mapv (comp pad byte-length) buffers)
col-len (long (reduce + 0 lengths))]
{:node {:n-elems n-rows
:n-null-entries n-missing}
:buffers buffers
:length col-len})))
(vec))
nodes (lznc/map :node nodes-buffs-lens)
{:keys [compression-type buffers]}
(compress-record-batch-buffers
(mapcat :buffers nodes-buffs-lens)
options)
(->> (lznc/map :buffers nodes-buffs-lens)
(lznc/apply-concat)
(compress-record-batch-buffers options))

buf-entries (buffers->buf-entries buffers)
last-entry (last buf-entries)
Expand Down

0 comments on commit 69d62ee

Please sign in to comment.