// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

- import ndarray from 'ndarray';
- import matrixProduct from 'ndarray-gemm';
- import nd_ops from 'ndarray-ops';
-
import {Conv} from '../../../ops/conv';
import {Tensor} from '../../../tensor';
import {PoolConvUtil} from '../../../util';
import {CpuInferenceHandler} from '../inference-handler';
+ import {matMul2d} from './matmul';

export class CpuConv extends Conv {
  run(inferenceHandler: CpuInferenceHandler, inputs: Tensor[]): Tensor[] {
+     const x = inputs[0];
+     const w = inputs[1];
+     const b = inputs.length === 3 ? inputs[2] : undefined;
+
    // if kernelShape is not specified in the attributes of this op, infer it from the weight tensor dims
    if (this.kernelShape.length === 0) {
      const wDims = inputs[1].dims;
@@ -20,88 +22,71 @@ export class CpuConv extends Conv {
      }
    }

-     const output = conv(
-         inputs[0], inputs[1], inputs.length === 3 ? inputs[2] : null, this.autoPad, this.dilations, this.group,
-         this.kernelShape, this.pads, this.strides);
-     return [output];
-   }
- }
-
- export function conv(
-     x: Tensor, w: Tensor, b: Tensor|null, autoPad: string, dilations: number[], group: number, kernelShape: number[],
-     pads: number[], strides: number[]): Tensor {
-   let ndx = ndarray(x.floatData as Float32Array, x.dims.slice(0)).transpose(0, 2, 3, 1);
-   const ndk = ndarray(w.floatData as Float32Array, w.dims.slice(0)).transpose(2, 3, 1, 0);
-
-   // adjusting pads based on 'autoPad' attribute
-   PoolConvUtil.adjustPadsBasedOnAutoPad(x.dims, strides, dilations, kernelShape, pads, autoPad);
-
-   // padding if needed
-   const localPads: Array<[number, number]> = [[0, 0], [pads[0], pads[2]], [pads[1], pads[3]], [0, 0]];
-   const padTotal = localPads.reduce((s, p) => s + p[0] + p[1], 0);
-   if (padTotal !== 0) {
-     const shape: number[] = ndx.shape;
-     const newShape = shape.map((len, index) => len + localPads[index][0] + localPads[index][1]);
-     const newSize = newShape.reduce((m, v) => m * v, 1);
-     const ndp = ndarray(new Float32Array(newSize), newShape);
-     const hiPoint = localPads.map((pair, index) => newShape[index] - pair[1]);
-     const loPoint = localPads.map(pair => pair[0]);
-     const originalSlice = ndp.hi(...hiPoint).lo(...loPoint);
-     nd_ops.assign(originalSlice, ndx);
-     ndx = ndp;
-   }
-
-   const [batchSize, xRows, xCols, xChannels] = ndx.shape;
-   const [wRows, wCols, yChannels] = [ndk.shape[0], ndk.shape[1], ndk.shape[3]];
-
-   // calculate the patch view in source image's size after dilations
-   const pvRows = wRows + (wRows - 1) * (dilations[0] - 1);
-   const pvCols = wCols + (wCols - 1) * (dilations[1] - 1);
+     // create output Tensor after determining output size (after adjusting pads based on 'autoPad' attribute)
+     const outputDims = PoolConvUtil.computeConvOutputShape(
+         x.dims, w.dims, this.strides, this.dilations, this.kernelShape, this.pads, this.autoPad);
+     const y = new Tensor(outputDims, x.type);

-   const yRows = Math.floor((xRows - pvRows + strides[0]) / strides[0]);
-   const yCols = Math.floor((xCols - pvCols + strides[1]) / strides[1]);
-
-   const ySize = batchSize * yRows * yCols * yChannels;
-   const patchSize = wRows * wCols * xChannels;
-
-   const ndf = ndarray(new Float64Array(ndk.size), [patchSize, yChannels]);
-   const patch = ndarray(new Float64Array(patchSize), [wRows, wCols, xChannels]);
-   for (let yChannel = 0; yChannel < yChannels; ++yChannel) {
-     nd_ops.assign(patch, ndk.pick(null, null, null, yChannel));
-     const reshapedPatch = ndarray(patch.data, [patchSize]);
-     nd_ops.assign(ndf.pick(null, yChannel), reshapedPatch);
+     conv2d(y, x, w, b, this.dilations, this.group, this.pads, this.strides);
+     return [y];
  }
+ }

-   const yArray = new Float64Array(ySize);
-   const pixelVec = ndarray(new Float64Array(yChannels), [1, yChannels]);
-   let offset = 0;
-   for (let b = 0; b < batchSize; ++b) {
-     const image = ndx.pick(b, null, null, null);
-     for (let yRow = 0; yRow < yRows; ++yRow) {
-       const xRowStart = yRow * strides[0];
-       for (let yCol = 0; yCol < yCols; ++yCol) {
-         const xColStart = yCol * strides[1];
-
-         const patchView = image.hi(xRowStart + pvRows, xColStart + pvCols, xChannels)
-                               .lo(xRowStart, xColStart, 0)
-                               .step(dilations[0], dilations[1], 1);
-         nd_ops.assign(patch, patchView);
-         const pvVec = ndarray(patch.data, [1, patchSize]);
-         matrixProduct(pixelVec, pvVec, ndf);
-         yArray.set(pixelVec.data, offset);
-         offset += yChannels;
-       }
+ // tslint:disable:variable-name
+ export function conv2d(
+     Y: Tensor, X: Tensor, W: Tensor, B: Tensor|undefined, dilations: ReadonlyArray<number>, group: number,
+     pads: ReadonlyArray<number>, strides: ReadonlyArray<number>): void {
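+   // Lower each (image, group) slice of X into a column buffer with im2col, then
+   // multiply it by the matching filter slice (matMul2d) to fill one slice of Y.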
+   const input_num = X.dims[0];
+   const input_channels = X.dims[1];
+   const input_height = X.dims[2];
+   const input_width = X.dims[3];
+
+   const filter_num = W.dims[0];
+   const filter_channels = W.dims[1];
+   const filter_height = W.dims[2];
+   const filter_width = W.dims[3];
+   const filter_size = filter_num * filter_channels * filter_height * filter_width;
+   const kernel_shape = [filter_height, filter_width];
+
+   const output_num = Y.dims[0];
+   const output_channels = Y.dims[1];
+   const output_height = Y.dims[2];
+   const output_width = Y.dims[3];
+   const output_size = output_num * output_channels * output_height * output_width;
+
+   const input_image_size = input_height * input_width;
+   const output_image_size = output_height * output_width;
+   const kernel_size = kernel_shape[0] * kernel_shape[1];
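+   // Per-group strides: how far to advance within X, Y and W to reach the next group.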
+   const X_offset = input_channels / group * input_image_size;
+   const Y_offset = output_size / output_num / group;
+   const W_offset = filter_size / group;
+   const kernel_dim = input_channels / group * kernel_size;
+   const col_buffer_size = kernel_dim * output_image_size;
+
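+   // A single column buffer, reused for every (image, group) pair.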
+   const col_buffer_data = new Float32Array(col_buffer_size);
+
+   // Running offsets of the current image within X and Y; declared outside the image
+   // loop so the per-image increments below accumulate across iterations.
+   let X_image_offset = 0;
+   let Y_image_offset = 0;
+   for (let image_id = 0; image_id < input_num; ++image_id) {
+     for (let group_id = 0; group_id < group; ++group_id) {
+       im2col(
+           X.floatData.subarray(X_image_offset + group_id * X_offset), col_buffer_data, input_channels / group,
+           input_height, input_width, kernel_shape[0], kernel_shape[1], dilations[0], dilations[1], pads[0], pads[1],
+           pads[2], pads[3], strides[0], strides[1]);
+
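+       // GEMM: [filter_num/group, kernel_dim] x [kernel_dim, output_image_size]
+       // -> [filter_num/group, output_image_size], written directly into Y's buffer.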
+       matMul2d(
+           W.floatData.subarray(group_id * W_offset), col_buffer_data,
+           Y.floatData.subarray(Y_image_offset + group_id * Y_offset), false, false, 1, 0, filter_num / group,
+           output_image_size, kernel_dim);
    }
+
+     X_image_offset += X_offset * group;
+     Y_image_offset += Y_offset * group;
  }
-   const ndy = ndarray(yArray, [batchSize, yRows, yCols, yChannels]);
-   const ndyTransed = ndarray(new Float32Array(ySize), [batchSize, yChannels, yRows, yCols]);
-   nd_ops.assign(ndyTransed, ndy.transpose(0, 3, 1, 2));
-   const Y = new Tensor(ndyTransed.shape, 'float32');
-   Y.floatData.set(ndyTransed.data);

  // Add bias if applicable
- if (b) {
-   const biasData = b.numberData;
+   if (B) {
+     const biasData = B.floatData;
    const outputData = Y.floatData;
    const batchSize = Y.dims[0];
    const outputChannels = Y.dims[1];
@@ -116,6 +101,65 @@ export function conv(
      }
    }
  }
+ }
+
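+ // im2col unfolds the input image into a column buffer in which each column holds one
+ // receptive-field patch, so convolution reduces to a matrix multiplication. For
+ // example, a 1x4x4 input with a 3x3 kernel, stride 1 and no padding yields a 2x2
+ // output and a [9 x 4] column buffer.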
+ function im2col(
+     data_im: Float32Array|Float64Array, data_col: Float32Array|Float64Array, channels: number, height: number,
+     width: number, kernel_h: number, kernel_w: number, dilation_h: number, dilation_w: number, pad_t: number,
+     pad_l: number, pad_b: number, pad_r: number, stride_h: number, stride_w: number) {
+   const output_h = ~~((height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h) + 1;
+   const output_w = ~~((width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w) + 1;
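+   // '~~' truncates toward zero, which equals Math.floor() for the non-negative
+   // values computed here.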
+
+   // Fast path for zero padding and no dilation
+   // From Torch, THNN_(unfolded_copy)
+   if (dilation_h === 1 && dilation_w === 1 && pad_l === 0 && pad_r === 0 && pad_t === 0 && pad_b === 0) {
+     for (let k = 0; k < channels * kernel_h * kernel_w; k++) {
+       const nip = ~~(k / (kernel_h * kernel_w));
+       const rest = k % (kernel_h * kernel_w);
+       const kh = ~~(rest / kernel_w);
+       const kw = rest % kernel_w;
+       const dst_offset = nip * (kernel_h * kernel_w * output_h * output_w) + kh * (kernel_w * output_h * output_w) +
+           kw * (output_h * output_w);
+       const src_offset = nip * (height * width);
+       for (let y = 0; y < output_h; y++) {
+         const iy = y * stride_h + kh;
+         const ix = kw;
+         if (stride_w === 1) {
+           data_col.set(
+               data_im.subarray(src_offset + iy * width + ix, src_offset + iy * width + ix + output_w),
+               dst_offset + y * output_w);
+         } else {
+           for (let x = 0; x < output_w; x++) {
+             data_col[dst_offset + (y * output_w + x)] = data_im[src_offset + (iy * width + ix + x * stride_w)];
+           }
+         }
+       }
+     }
+     return;
+   }

-   return Y;
+   // Baseline path: handles arbitrary padding and dilation.
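+   // Effective kernel extents once dilation is applied.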
+   const dkernel_h = dilation_h * (kernel_h - 1) + 1;
+   const dkernel_w = dilation_w * (kernel_w - 1) + 1;
+
+   const height_col = ~~((height + pad_t + pad_b - dkernel_h) / stride_h) + 1;
+   const width_col = ~~((width + pad_l + pad_r - dkernel_w) / stride_w) + 1;
+
+   const channels_col = channels * kernel_h * kernel_w;
+   for (let c = 0; c < channels_col; ++c) {
+     const w_offset = c % kernel_w;
+     const h_offset = ~~(c / kernel_w) % kernel_h;
+     const c_im = ~~(c / (kernel_h * kernel_w));
+     for (let h = 0; h < height_col; ++h) {
+       for (let w = 0; w < width_col; ++w) {
+         const h_pad = h * stride_h - pad_t + h_offset * dilation_h;
+         const w_pad = w * stride_w - pad_l + w_offset * dilation_w;
+         if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) {
+           data_col[(c * height_col + h) * width_col + w] = data_im[(c_im * height + h_pad) * width + w_pad];
+         } else {
+           data_col[(c * height_col + h) * width_col + w] = 0;
+         }
+       }
+     }
+   }
}