Commit 5753e5d0 by vincent

added handling of batch inputs; the face landmark net now works with batch inputs

parent 1c89e90a
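
In short: NetInput now wraps a batch of tf.Tensor3Ds instead of canvases, every net accepts any TNetInput (media elements, element ids, tensors, or arrays of these), and the landmark net runs a whole batch in one forward pass. A minimal usage sketch of the resulting API (the element ids and the async wrapper are illustrative, not part of the commit):

```ts
import * as faceapi from 'face-api.js';

async function batchExample() {
  // an array of inputs is treated as a batch; 'img1' and 'img2' are
  // hypothetical element ids, resolved via document.getElementById
  const landmarks = await faceapi.detectLandmarks(['img1', 'img2']);

  // for a batch input the result is a FaceLandmarks[] with one entry
  // per input; for a single input it is a single FaceLandmarks
  console.log(Array.isArray(landmarks)); // true for this batch of two
}
```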
......@@ -84,9 +84,12 @@
    const detections = await faceapi.locateFaces(input, minConfidence)
    faceapi.drawDetection('overlay', detections.map(det => det.forSize(width, height)))
-   const faceImages = await faceapi.extractFaces(input.canvases[0], detections)
+   const faceImages = await faceapi.extractFaces(input.inputs[0], detections)
    $('#facesContainer').empty()
    faceImages.forEach(canvas => $('#facesContainer').append(canvas))
+
+   // free memory for input tensors
+   input.dispose()
  }

  async function onSelectionChanged(uri) {
......
......@@ -103,6 +103,9 @@
    faceapi.drawLandmarks(canvas, landmarksByFace, { lineWidth: drawLines ? 2 : 4, drawLines, color: 'red' })
    faceapi.drawDetection('overlay', locations.map(det => det.forSize(width, height)))
+
+   // free memory for input tensors
+   input.dispose()
  }

  async function run() {
......
......@@ -84,7 +84,7 @@
    const input = await faceapi.toNetInput(inputImgEl)
    const locations = await faceapi.locateFaces(input, minConfidence)
-   const faceImages = await faceapi.extractFaces(input.canvases[0], locations)
+   const faceImages = await faceapi.extractFaces(input.inputs[0], locations)

    // detect landmarks and get the aligned face image bounding boxes
    const alignedFaceBoxes = await Promise.all(faceImages.map(
......@@ -93,7 +93,10 @@
        return faceLandmarks.align(locations[i])
      }
    ))
-   const alignedFaceImages = await faceapi.extractFaces(input.canvases[0], alignedFaceBoxes)
+   const alignedFaceImages = await faceapi.extractFaces(input.inputs[0], alignedFaceBoxes)
+
+   // free memory for input tensors
+   input.dispose()

    $('#facesContainer').empty()
    faceImages.forEach(async (faceCanvas, i) => {
......
......@@ -72,14 +72,13 @@
    if(videoEl.paused || videoEl.ended || !modelLoaded)
      return false

-   const input = await faceapi.toNetInput(videoEl)
-   const { width, height } = input
+   const { width, height } = faceapi.getMediaDimensions(videoEl)
    const canvas = $('#overlay').get(0)
    canvas.width = width
    canvas.height = height

    const ts = Date.now()
-   result = await faceapi.locateFaces(input, minConfidence)
+   result = await faceapi.locateFaces(videoEl, minConfidence)
    displayTimeStats(Date.now() - ts)

    faceapi.drawDetection('overlay', result.map(det => det.forSize(width, height)))
......
- import { Dimensions, TMediaElement } from './types';
+ import * as tf from '@tensorflow/tfjs-core';
+
+ import { isTensor3D, isTensor4D } from './commons/isTensor';
+ import { padToSquare } from './padToSquare';
+ import { Point } from './Point';
+ import { TResolvedNetInput } from './types';
  import { createCanvasFromMedia } from './utils';

  export class NetInput {
-   private _canvases: HTMLCanvasElement[]
+   private _inputs: tf.Tensor3D[] = []
+   private _isManaged: boolean = false
+   private _inputDimensions: number[][] = []
+   private _paddings: Point[] = []

-   constructor(
-     medias: Array<TMediaElement>,
-     dims?: Dimensions
-   ) {
-     this._canvases = []
-     medias.forEach(m => this.initCanvas(m, dims))
-   }
+   constructor(inputs: tf.Tensor4D | Array<TResolvedNetInput>) {
+     if (isTensor4D(inputs)) {
+       this._inputs = tf.unstack(inputs as tf.Tensor4D) as tf.Tensor3D[]
+     }
+
+     if (Array.isArray(inputs)) {
+       this._inputs = inputs.map(input => {
+         if (isTensor3D(input)) {
+           // TODO: make sure not to dispose original tensors passed in by the user
+           return tf.clone(input as tf.Tensor3D)
+         }
+         return tf.fromPixels(
+           input instanceof HTMLCanvasElement ? input : createCanvasFromMedia(input as HTMLImageElement | HTMLVideoElement)
+         )
+       })
+     }
+
+     this._inputDimensions = this._inputs.map(t => t.shape)
+   }

-   private initCanvas(media: TMediaElement, dims?: Dimensions) {
-     if (media instanceof HTMLCanvasElement) {
-       this._canvases.push(media)
-       return
-     }
-     this._canvases.push(createCanvasFromMedia(media, dims))
-   }
-
-   public get canvases() : HTMLCanvasElement[] {
-     return this._canvases
-   }
+   public get inputs(): tf.Tensor3D[] {
+     return this._inputs
+   }
+
+   public get isManaged(): boolean {
+     return this._isManaged
+   }
+
+   public get batchSize(): number {
+     return this._inputs.length
+   }
+
+   public get inputDimensions(): number[][] {
+     return this._inputDimensions
+   }
+
+   public get paddings(): Point[] {
+     return this._paddings
+   }
+
+   public getInputDimensions(batchIdx: number): number[] {
+     return this._inputDimensions[batchIdx]
+   }
+
+   public getInputHeight(batchIdx: number): number {
+     return this._inputDimensions[batchIdx][0]
+   }
+
+   public getInputWidth(batchIdx: number): number {
+     return this._inputDimensions[batchIdx][1]
+   }
+
+   public getPaddings(batchIdx: number): Point {
+     return this._paddings[batchIdx]
+   }
+
+   public toBatchTensor(inputSize: number, isCenterInputs: boolean = true): tf.Tensor4D {
+     return tf.tidy(() => {
+       const inputTensors = this._inputs.map((inputTensor: tf.Tensor3D) => {
+         const [originalHeight, originalWidth] = inputTensor.shape
+
+         let imgTensor = inputTensor.expandDims().toFloat() as tf.Tensor4D
+         imgTensor = padToSquare(imgTensor, isCenterInputs)
+
+         const [heightAfterPadding, widthAfterPadding] = imgTensor.shape.slice(1)
+         if (heightAfterPadding !== inputSize || widthAfterPadding !== inputSize) {
+           imgTensor = tf.image.resizeBilinear(imgTensor, [inputSize, inputSize])
+         }
+
+         this._paddings.push(new Point(
+           widthAfterPadding - originalWidth,
+           heightAfterPadding - originalHeight
+         ))
+
+         return imgTensor
+       })
+
+       const batchTensor = tf.stack(inputTensors).as4D(this.batchSize, inputSize, inputSize, 3)
+
+       if (this.isManaged) {
+         this.dispose()
+       }
+
+       return batchTensor
+     })
+   }
+
+   /**
+    * By setting the isManaged flag, all newly created tensors will be automatically
+    * disposed after the batch tensor has been created.
+    */
+   public managed() {
+     this._isManaged = true
+     return this
+   }
+
+   public dispose() {
+     this._inputs.forEach(t => t.dispose())
+   }
  }
\ No newline at end of file
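
For orientation, a sketch of the intended NetInput lifecycle implied by the class above (the import path and example function are assumptions): toBatchTensor pads each 3D input to a square, resizes it to the requested network input size, and records the applied paddings per batch index; a managed instance frees its tensors as soon as the batch tensor exists, while an unmanaged one must be disposed manually.

```ts
import { NetInput } from './NetInput';

function netInputLifecycle(img: HTMLImageElement) {
  // unmanaged: the caller owns the wrapped input tensors
  const netInput = new NetInput([img]);
  const batch = netInput.toBatchTensor(512, false); // shape [1, 512, 512, 3]
  console.log(netInput.getPaddings(0));             // padding applied to input 0
  batch.dispose();
  netInput.dispose(); // must be called manually

  // managed: toBatchTensor disposes the wrapped tensors itself
  const managed = new NetInput([img]).managed();
  const batch2 = managed.toBatchTensor(512, false);
  batch2.dispose();
}
```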
- import * as tf from '@tensorflow/tfjs-core';
  import { extractFaceTensors } from './extractFaceTensors';
  import { FaceDetectionNet } from './faceDetectionNet/FaceDetectionNet';
  import { FaceLandmarkNet } from './faceLandmarkNet/FaceLandmarkNet';
  import { FaceLandmarks } from './faceLandmarkNet/FaceLandmarks';
  import { FaceRecognitionNet } from './faceRecognitionNet/FaceRecognitionNet';
  import { FullFaceDescription } from './FullFaceDescription';
- import { NetInput } from './NetInput';
  import { TNetInput } from './types';

  export function allFacesFactory(
......@@ -15,22 +12,25 @@ export function allFacesFactory(
    recognitionNet: FaceRecognitionNet
  ) {
    return async function(
-     input: tf.Tensor | NetInput | TNetInput,
+     input: TNetInput,
      minConfidence: number
    ): Promise<FullFaceDescription[]> {
      const detections = await detectionNet.locateFaces(input, minConfidence)
      const faceTensors = await extractFaceTensors(input, detections)

+     /**
      const faceLandmarksByFace = await Promise.all(faceTensors.map(
        faceTensor => landmarkNet.detectLandmarks(faceTensor)
      )) as FaceLandmarks[]
+     */
+     const faceLandmarksByFace = await landmarkNet.detectLandmarks(faceTensors) as FaceLandmarks[]

      faceTensors.forEach(t => t.dispose())

-     const alignedFaceBoxes = await Promise.all(faceLandmarksByFace.map(
+     const alignedFaceBoxes = faceLandmarksByFace.map(
        (landmarks, i) => landmarks.align(detections[i].getBox())
-     ))
+     )

      const alignedFaceTensors = await extractFaceTensors(input, alignedFaceBoxes)

      const descriptors = await Promise.all(alignedFaceTensors.map(
......
- import * as tf from '@tensorflow/tfjs-core';
- import { NetInput } from '../NetInput';
- import { tensorTo4D } from './tensorTo4D';
-
- export function getImageTensor(input: tf.Tensor | NetInput): tf.Tensor4D {
-   return tf.tidy(() => {
-     if (input instanceof tf.Tensor) {
-       return tensorTo4D(input)
-     }
-     if (!(input instanceof NetInput)) {
-       throw new Error('getImageTensor - expected input to be a tensor or an instance of NetInput')
-     }
-     if (input.canvases.length > 1) {
-       throw new Error('getImageTensor - batch input is not accepted here')
-     }
-     return tf.fromPixels(input.canvases[0]).expandDims(0).toFloat() as tf.Tensor4D
-   })
- }
\ No newline at end of file
+ export function isMediaElement(input: any) {
+   return input instanceof HTMLImageElement
+     || input instanceof HTMLVideoElement
+     || input instanceof HTMLCanvasElement
+ }
\ No newline at end of file
  import * as tf from '@tensorflow/tfjs-core';

- export function isTensor(tensor: tf.Tensor, dim: number) {
+ export function isTensor(tensor: any, dim: number) {
    return tensor instanceof tf.Tensor && tensor.shape.length === dim
  }

- export function isTensor1D(tensor: tf.Tensor) {
+ export function isTensor1D(tensor: any) {
    return isTensor(tensor, 1)
  }

- export function isTensor2D(tensor: tf.Tensor) {
+ export function isTensor2D(tensor: any) {
    return isTensor(tensor, 2)
  }

- export function isTensor3D(tensor: tf.Tensor) {
+ export function isTensor3D(tensor: any) {
    return isTensor(tensor, 3)
  }

- export function isTensor4D(tensor: tf.Tensor) {
+ export function isTensor4D(tensor: any) {
    return isTensor(tensor, 4)
  }
\ No newline at end of file
- import * as tf from '@tensorflow/tfjs-core';
- import { NetInput } from '../NetInput';
- import { padToSquare } from '../padToSquare';
- import { tensorTo4D } from './tensorTo4D';
- import { BatchReshapeInfo } from './types';
-
- export function toInputTensor(
-   input: tf.Tensor | tf.Tensor[] | NetInput,
-   inputSize: number,
-   center: boolean = true
- ): { batchTensor: tf.Tensor4D, batchInfo: BatchReshapeInfo[] } {
-   if (!(input instanceof tf.Tensor) && !(input instanceof NetInput)) {
-     throw new Error('toInputTensor - expected input to be a tensor or an instance of NetInput')
-   }
-
-   return tf.tidy(() => {
-     const inputTensors = input instanceof NetInput
-       ? input.canvases.map(c => tf.expandDims(tf.fromPixels(c)))
-       : [tensorTo4D(input)]
-
-     const preprocessedTensors: tf.Tensor4D[] = []
-     const batchInfo: BatchReshapeInfo[] = []
-
-     inputTensors.forEach((inputTensor: tf.Tensor4D) => {
-       const [originalHeight, originalWidth] = inputTensor.shape.slice(1)
-
-       let imgTensor = padToSquare(inputTensor.toFloat(), center)
-       const [heightAfterPadding, widthAfterPadding] = imgTensor.shape.slice(1)
-       if (heightAfterPadding !== inputSize || widthAfterPadding !== inputSize) {
-         imgTensor = tf.image.resizeBilinear(imgTensor, [inputSize, inputSize])
-       }
-
-       preprocessedTensors.push(imgTensor)
-       batchInfo.push({
-         originalWidth,
-         originalHeight,
-         paddingX: widthAfterPadding - originalWidth,
-         paddingY: heightAfterPadding - originalHeight
-       })
-     })
-
-     const batchSize = inputTensors.length
-     return {
-       batchTensor: tf.stack(preprocessedTensors).as4D(batchSize, inputSize, inputSize, 3),
-       batchInfo
-     }
-   })
- }
\ No newline at end of file
  import { FaceDetection } from '../faceDetectionNet/FaceDetection';
  import { FaceLandmarks } from '../faceLandmarkNet/FaceLandmarks';
  import { Point } from '../Point';
- import { getContext2dOrThrow, getElement, round } from '../utils';
+ import { getContext2dOrThrow, resolveInput, round } from '../utils';
  import { DrawBoxOptions, DrawLandmarksOptions, DrawOptions, DrawTextOptions } from './types';

  export function getDefaultDrawOptions(): DrawOptions {
......@@ -55,7 +55,7 @@ export function drawDetection(
    detection: FaceDetection | FaceDetection[],
    options?: DrawBoxOptions & DrawTextOptions & { withScore: boolean }
  ) {
-   const canvas = getElement(canvasArg)
+   const canvas = resolveInput(canvasArg)
    if (!(canvas instanceof HTMLCanvasElement)) {
      throw new Error('drawBox - expected canvas to be of type: HTMLCanvasElement')
    }
......@@ -132,7 +132,7 @@ export function drawLandmarks(
    faceLandmarks: FaceLandmarks | FaceLandmarks[],
    options?: DrawLandmarksOptions & { drawLines: boolean }
  ) {
-   const canvas = getElement(canvasArg)
+   const canvas = resolveInput(canvasArg)
    if (!(canvas instanceof HTMLCanvasElement)) {
      throw new Error('drawLandmarks - expected canvas to be of type: HTMLCanvasElement')
    }
......
  import * as tf from '@tensorflow/tfjs-core';
- import { getImageTensor } from './commons/getImageTensor';
  import { FaceDetection } from './faceDetectionNet/FaceDetection';
- import { NetInput } from './NetInput';
  import { Rect } from './Rect';
  import { toNetInput } from './toNetInput';
  import { TNetInput } from './types';
......@@ -18,16 +16,21 @@ import { TNetInput } from './types';
   * @returns Tensors of the corresponding image region for each detected face.
   */
  export async function extractFaceTensors(
-   input: tf.Tensor | NetInput | TNetInput,
-   detections: Array<FaceDetection|Rect>
+   input: TNetInput,
+   detections: Array<FaceDetection | Rect>
  ): Promise<tf.Tensor4D[]> {
-   const image = input instanceof tf.Tensor
-     ? input
-     : await toNetInput(input)
+   const netInput = await toNetInput(input, true)
+
+   if (netInput.batchSize > 1) {
+     if (netInput.isManaged) {
+       netInput.dispose()
+     }
+     throw new Error('extractFaceTensors - batchSize > 1 not supported')
+   }

    return tf.tidy(() => {
-     const imgTensor = getImageTensor(image)
+     const imgTensor = netInput.inputs[0].expandDims().toFloat() as tf.Tensor4D
      const [imgHeight, imgWidth, numChannels] = imgTensor.shape.slice(1)
......@@ -40,6 +43,9 @@ export async function extractFaceTensors(
        tf.slice(imgTensor, [0, y, x, 0], [1, height, width, numChannels])
      )

+     if (netInput.isManaged) {
+       netInput.dispose()
+     }
+
      return faceTensors
    })
  }
\ No newline at end of file
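
A usage sketch of the updated extractFaceTensors (assuming it is re-exported on the faceapi namespace): it now resolves any TNetInput itself, still rejects batches, and the returned face tensors are owned by the caller.

```ts
import * as faceapi from 'face-api.js';

async function faceTensorsExample(img: HTMLImageElement) {
  const detections = await faceapi.locateFaces(img);
  const faceTensors = await faceapi.extractFaceTensors(img, detections);
  // ... feed the extracted regions into the landmark or recognition net ...
  faceTensors.forEach(t => t.dispose()); // the caller owns these tensors
}
```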
  import { FaceDetection } from './faceDetectionNet/FaceDetection';
  import { Rect } from './Rect';
- import { createCanvas, getContext2dOrThrow } from './utils';
+ import { toNetInput } from './toNetInput';
+ import { TNetInput } from './types';
+ import { createCanvas, getContext2dOrThrow, imageTensorToCanvas } from './utils';

  /**
   * Extracts the image regions containing the detected faces.
......@@ -9,15 +11,31 @@ import { createCanvas, getContext2dOrThrow } from './utils';
   * @param detections The face detection results or face bounding boxes for that image.
   * @returns The Canvases of the corresponding image region for each detected face.
   */
- export function extractFaces(
-   image: HTMLCanvasElement,
-   detections: Array<FaceDetection|Rect>
- ): HTMLCanvasElement[] {
-   const ctx = getContext2dOrThrow(image)
+ export async function extractFaces(
+   input: TNetInput,
+   detections: Array<FaceDetection | Rect>
+ ): Promise<HTMLCanvasElement[]> {
+   let canvas = input as HTMLCanvasElement
+
+   if (!(input instanceof HTMLCanvasElement)) {
+     const netInput = await toNetInput(input, true)
+
+     if (netInput.batchSize > 1) {
+       if (netInput.isManaged) {
+         netInput.dispose()
+       }
+       throw new Error('extractFaces - batchSize > 1 not supported')
+     }
+
+     canvas = await imageTensorToCanvas(netInput.inputs[0])
+   }
+
+   const ctx = getContext2dOrThrow(canvas)
    const boxes = detections.map(
      det => det instanceof FaceDetection
-       ? det.forSize(image.width, image.height).getBox().floor()
+       ? det.forSize(canvas.width, canvas.height).getBox().floor()
        : det
    )
    return boxes.map(({ x, y, width, height }) => {
......
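
extractFaces is now async and accepts any TNetInput rather than only an HTMLCanvasElement; a short sketch of the updated call site (the image element is assumed):

```ts
import * as faceapi from 'face-api.js';

async function extractFacesExample(img: HTMLImageElement) {
  const detections = await faceapi.locateFaces(img);
  // now async; non-canvas inputs are drawn to a canvas internally
  const faceCanvases = await faceapi.extractFaces(img, detections);
  faceCanvases.forEach(c => document.body.appendChild(c));
}
```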
  import * as tf from '@tensorflow/tfjs-core';

- import { getImageTensor } from '../commons/getImageTensor';
  import { NetInput } from '../NetInput';
- import { padToSquare } from '../padToSquare';
  import { Rect } from '../Rect';
  import { toNetInput } from '../toNetInput';
- import { Dimensions, TNetInput } from '../types';
+ import { TNetInput } from '../types';
  import { extractParams } from './extractParams';
  import { FaceDetection } from './FaceDetection';
  import { loadQuantizedParams } from './loadQuantizedParams';
......@@ -13,7 +11,6 @@ import { mobileNetV1 } from './mobileNetV1';
  import { nonMaxSuppression } from './nonMaxSuppression';
  import { outputLayer } from './outputLayer';
  import { predictionLayer } from './predictionLayer';
- import { resizeLayer } from './resizeLayer';
  import { NetParams } from './types';

  export class FaceDetectionNet {
......@@ -36,15 +33,16 @@ export class FaceDetectionNet {
    this._params = extractParams(weights)
  }

- private forwardTensor(imgTensor: tf.Tensor4D) {
+ public forwardInput(input: NetInput) {
    if (!this._params) {
      throw new Error('FaceDetectionNet - load model before inference')
    }

    return tf.tidy(() => {
+     const batchTensor = input.toBatchTensor(512, false)

-     const resized = resizeLayer(imgTensor) as tf.Tensor4D
-     const features = mobileNetV1(resized, this._params.mobilenetv1_params)
+     const x = tf.sub(tf.mul(batchTensor, tf.scalar(0.007843137718737125)), tf.scalar(1)) as tf.Tensor4D
+     const features = mobileNetV1(x, this._params.mobilenetv1_params)

      const {
        boxPredictions,
......@@ -55,44 +53,23 @@
    })
  }

- public async forward(input: tf.Tensor | NetInput | TNetInput) {
-   const netInput = input instanceof tf.Tensor
-     ? input
-     : await toNetInput(input)
-
-   return tf.tidy(() =>
-     this.forwardTensor(padToSquare(getImageTensor(netInput)))
-   )
+ public async forward(input: TNetInput) {
+   return this.forwardInput(await toNetInput(input, true))
  }

  public async locateFaces(
-   input: tf.Tensor | NetInput | TNetInput,
+   input: TNetInput,
    minConfidence: number = 0.8,
    maxResults: number = 100,
  ): Promise<FaceDetection[]> {
-   const netInput = input instanceof tf.Tensor
-     ? input
-     : await toNetInput(input)
-
-   let paddedHeightRelative = 1, paddedWidthRelative = 1
-   let imageDimensions: Dimensions | undefined
+   const netInput = await toNetInput(input, true)

    const {
      boxes: _boxes,
      scores: _scores
-   } = tf.tidy(() => {
-     let imgTensor = getImageTensor(netInput)
-     const [height, width] = imgTensor.shape.slice(1)
-     imageDimensions = { width, height }
-
-     imgTensor = padToSquare(imgTensor)
-     paddedHeightRelative = imgTensor.shape[1] / height
-     paddedWidthRelative = imgTensor.shape[2] / width
-
-     return this.forwardTensor(imgTensor)
-   })
+   } = this.forwardInput(netInput)

    // TODO batches
    const boxes = _boxes[0]
......@@ -114,6 +91,10 @@
      minConfidence
    )

+   const paddedHeightRelative = (netInput.getPaddings(0).y + netInput.getInputHeight(0)) / netInput.getInputHeight(0)
+   const paddedWidthRelative = (netInput.getPaddings(0).x + netInput.getInputWidth(0)) / netInput.getInputWidth(0)
+
    const results = indices
      .map(idx => {
        const [top, bottom] = [
......@@ -132,7 +113,10 @@
          right - left,
          bottom - top
        ),
-       imageDimensions as Dimensions
+       {
+         height: netInput.getInputHeight(0),
+         width: netInput.getInputWidth(0)
+       }
      )
    })
......
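
To make the relative-padding factors in locateFaces concrete, a worked example with assumed dimensions:

```ts
// assumed input: 400 x 300 (width x height), padded to a 400 x 400 square,
// so netInput.getPaddings(0) is { x: 0, y: 100 }
const paddedHeightRelative = (100 + 300) / 300; // ≈ 1.33
const paddedWidthRelative  = (0 + 400) / 400;   // = 1.0
// the net sees the padded square (resized to 512 x 512), so its relative
// box coordinates are multiplied by these factors to map them back onto
// the original, unpadded image
```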
- import * as tf from '@tensorflow/tfjs-core';
-
- const resizedImageSize = [512, 512] as [number, number]
- const weight = tf.scalar(0.007843137718737125)
- const bias = tf.scalar(1)
-
- export function resizeLayer(x: tf.Tensor4D) {
-   return tf.tidy(() => {
-     const resized = tf.image.resizeBilinear(x, resizedImageSize, false)
-     return tf.sub(tf.mul(resized, weight), bias)
-   })
- }
\ No newline at end of file
  import * as tf from '@tensorflow/tfjs-core';

  import { convLayer } from '../commons/convLayer';
- import { toInputTensor } from '../commons/toInputTensor';
  import { ConvParams } from '../commons/types';
  import { NetInput } from '../NetInput';
  import { Point } from '../Point';
......@@ -42,7 +41,7 @@ export class FaceLandmarkNet {
    this._params = extractParams(weights)
  }

- public forwardTensor(input: tf.Tensor | NetInput): tf.Tensor2D {
+ public forwardInput(input: NetInput): tf.Tensor2D {
    const params = this._params

    if (!params) {
......@@ -50,7 +49,7 @@
    }

    return tf.tidy(() => {
-     const { batchTensor, batchInfo } = toInputTensor(input, 128, true)
+     const batchTensor = input.toBatchTensor(128, true)

      let out = conv(batchTensor, params.conv0_params)
      out = maxPool(out)
......@@ -79,22 +78,22 @@
      */
      const landmarkTensors = fc1
-       .mul(tf.stack(batchInfo.map(info =>
+       .mul(tf.stack(Array.from(Array(input.batchSize), (_, batchIdx) =>
          createInterleavedTensor(
-           info.paddingX + info.originalWidth,
-           info.paddingY + info.originalHeight
+           input.getPaddings(batchIdx).x + input.getInputWidth(batchIdx),
+           input.getPaddings(batchIdx).y + input.getInputHeight(batchIdx)
          )
        )))
-       .sub(tf.stack(batchInfo.map(info =>
+       .sub(tf.stack(Array.from(Array(input.batchSize), (_, batchIdx) =>
          createInterleavedTensor(
-           Math.floor(info.paddingX / 2),
-           Math.floor(info.paddingY / 2)
+           Math.floor(input.getPaddings(batchIdx).x / 2),
+           Math.floor(input.getPaddings(batchIdx).y / 2)
          )
        )))
-       .div(tf.stack(batchInfo.map(info =>
+       .div(tf.stack(Array.from(Array(input.batchSize), (_, batchIdx) =>
          createInterleavedTensor(
-           info.originalWidth,
-           info.originalHeight
+           input.getInputWidth(batchIdx),
+           input.getInputHeight(batchIdx)
          )
        )))
......@@ -102,40 +101,33 @@
    })
  }

- public async forward(input: tf.Tensor | NetInput | TNetInput): Promise<tf.Tensor2D> {
-   const netInput = input instanceof tf.Tensor
-     ? input
-     : await toNetInput(input)
-
-   return this.forwardTensor(netInput)
+ public async forward(input: TNetInput): Promise<tf.Tensor2D> {
+   return this.forwardInput(await toNetInput(input, true))
  }

- public async detectLandmarks(input: tf.Tensor | NetInput | TNetInput): Promise<FaceLandmarks | FaceLandmarks[]> {
-   const netInput = input instanceof tf.Tensor
-     ? input
-     : await toNetInput(input)
+ public async detectLandmarks(input: TNetInput): Promise<FaceLandmarks | FaceLandmarks[]> {
+   const netInput = await toNetInput(input, true)

-   const landmarkTensors = tf.unstack(this.forwardTensor(netInput))
+   const landmarkTensors = tf.unstack(this.forwardInput(netInput))

    const landmarksForBatch = await Promise.all(landmarkTensors.map(
      async (landmarkTensor, batchIdx) => {
        const landmarksArray = Array.from(await landmarkTensor.data())
        landmarkTensor.dispose()

        const xCoords = landmarksArray.filter((_, i) => isEven(i))
        const yCoords = landmarksArray.filter((_, i) => !isEven(i))

-       const [height, width] = netInput instanceof tf.Tensor
-         ? netInput.shape.slice(1)
-         : [netInput.canvases[batchIdx].height, netInput.canvases[batchIdx].width]
-
        return new FaceLandmarks(
          Array(68).fill(0).map((_, i) => new Point(xCoords[i], yCoords[i])),
-         { height, width }
+         {
+           height: netInput.getInputHeight(batchIdx),
+           width : netInput.getInputWidth(batchIdx),
+         }
        )
      }
    ))

    landmarkTensors.forEach(t => t.dispose())

    return landmarksForBatch.length === 1 ? landmarksForBatch[0] : landmarksForBatch
  }
}
\ No newline at end of file
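
The interleaved mul/sub/div above maps the net's normalized landmark coordinates from the padded square back onto each original input. A worked sketch for a single x coordinate, with assumed dimensions:

```ts
// assumed input: 300 x 400 (width x height), padded with centering to a
// 400 x 400 square, so paddings = { x: 100, y: 0 }
const xNet = 0.5; // normalized x output, relative to the padded square
const x = (xNet * (100 + 300) - Math.floor(100 / 2)) / 300;
// = (200 - 50) / 300 = 0.5 -> the center of the padded square maps back
// to the center of the original image, as expected
```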
  import * as tf from '@tensorflow/tfjs-core';

- import { getImageTensor } from '../commons/getImageTensor';
  import { NetInput } from '../NetInput';
- import { padToSquare } from '../padToSquare';
  import { toNetInput } from '../toNetInput';
  import { TNetInput } from '../types';
  import { convDown } from './convLayer';
......@@ -32,25 +30,18 @@ export class FaceRecognitionNet {
    this._params = extractParams(weights)
  }

- public async forward(input: tf.Tensor | NetInput | TNetInput): Promise<tf.Tensor2D> {
+ public async forwardInput(input: NetInput): Promise<tf.Tensor2D> {
    if (!this._params) {
      throw new Error('FaceRecognitionNet - load model before inference')
    }

-   const netInput = input instanceof tf.Tensor
-     ? input
-     : await toNetInput(input)
-
    return tf.tidy(() => {
+     const batchTensor = input.toBatchTensor(150, true)

-     let x = padToSquare(getImageTensor(netInput), true)
-     // work with 150 x 150 sized face images
-     if (x.shape[1] !== 150 || x.shape[2] !== 150) {
-       x = tf.image.resizeBilinear(x, [150, 150])
-     }
-     x = normalize(x)
+     const normalized = normalize(batchTensor)

-     let out = convDown(x, this._params.conv32_down)
+     let out = convDown(normalized, this._params.conv32_down)
      out = tf.maxPool(out, 3, 2, 'valid')

      out = residual(out, this._params.conv32_1)
......@@ -77,13 +68,12 @@
      return fullyConnected
    })
  }

+ public async forward(input: TNetInput): Promise<tf.Tensor2D> {
+   return this.forwardInput(await toNetInput(input, true))
+ }

- public async computeFaceDescriptor(input: tf.Tensor | NetInput | TNetInput) {
-   const netInput = input instanceof tf.Tensor
-     ? input
-     : await toNetInput(input)
-
-   const result = await this.forward(netInput)
+ public async computeFaceDescriptor(input: TNetInput) {
+   const result = await this.forward(await toNetInput(input, true))

    const data = await result.data()
    result.dispose()

    return data as Float32Array
......
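
A usage sketch of the refactored recognition net (assuming the face canvas comes from extractFaces, and that the descriptor is the usual 128-dimensional one):

```ts
import * as faceapi from 'face-api.js';

async function descriptorExample(faceCanvas: HTMLCanvasElement) {
  // the 150 x 150 padding/resizing now happens inside forwardInput
  const descriptor = await faceapi.computeFaceDescriptor(faceCanvas);
  console.log(descriptor.length); // 128, one Float32 per dimension
}
```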
......@@ -35,7 +35,7 @@ export function loadModels(url: string) {
  }

  export function locateFaces(
-   input: tf.Tensor | NetInput | TNetInput,
+   input: TNetInput,
    minConfidence?: number,
    maxResults?: number
  ): Promise<FaceDetection[]> {
......@@ -43,13 +43,13 @@ export function locateFaces(
  }

  export function detectLandmarks(
-   input: tf.Tensor | NetInput | TNetInput
+   input: TNetInput
  ): Promise<FaceLandmarks | FaceLandmarks[]> {
    return landmarkNet.detectLandmarks(input)
  }

  export function computeFaceDescriptor(
-   input: tf.Tensor | NetInput | TNetInput
+   input: TNetInput
  ): Promise<Float32Array> {
    return recognitionNet.computeFaceDescriptor(input)
  }
......
+ import * as tf from '@tensorflow/tfjs-core';
+
+ import { isMediaElement } from './commons/isMediaElement';
+ import { isTensor3D, isTensor4D } from './commons/isTensor';
  import { NetInput } from './NetInput';
  import { TNetInput } from './types';
- import { awaitMediaLoaded, getElement } from './utils';
+ import { awaitMediaLoaded, resolveInput } from './utils';

  /**
   * Validates the input to make sure it is a valid net input and awaits all media elements
   * to finish loading.
   *
   * @param input The input, which can be a media element or an array of different media elements.
+  * @param manageCreatedInput If a new NetInput instance is created from the inputs, this flag
+  * determines whether to set the NetInput as managed or not.
   * @returns A NetInput instance, which can be passed into one of the neural networks.
   */
  export async function toNetInput(
-   input: NetInput | TNetInput
+   inputs: TNetInput,
+   manageCreatedInput: boolean = false
  ): Promise<NetInput> {
-   if (input instanceof NetInput) {
-     return input
+   if (inputs instanceof NetInput) {
+     return inputs
    }

+   const afterCreate = (netInput: NetInput) => manageCreatedInput
+     ? netInput.managed()
+     : netInput
+
+   if (isTensor4D(inputs)) {
+     return afterCreate(new NetInput(inputs as tf.Tensor4D))
+   }

-   const mediaArgArray = Array.isArray(input)
-     ? input
-     : [input]
+   let inputArgArray = Array.isArray(inputs)
+     ? inputs
+     : [inputs]

-   if (!mediaArgArray.length) {
+   if (!inputArgArray.length) {
      throw new Error('toNetInput - empty array passed as input')
    }

-   const medias = mediaArgArray.map(getElement)
+   const getIdxHint = (idx: number) => Array.isArray(inputs) ? ` at input index ${idx}:` : ''

-   medias.forEach((media, i) => {
-     if (!(media instanceof HTMLImageElement || media instanceof HTMLVideoElement || media instanceof HTMLCanvasElement)) {
-       const idxHint = Array.isArray(input) ? ` at input index ${i}:` : ''
-       if (typeof mediaArgArray[i] === 'string') {
-         throw new Error(`toNetInput -${idxHint} string passed, but could not resolve HTMLElement for element id`)
-       }
-       throw new Error(`toNetInput -${idxHint} expected media to be of type HTMLImageElement | HTMLVideoElement | HTMLCanvasElement, or to be an element id`)
-     }
-   })
+   const inputArray = inputArgArray
+     .map(resolveInput)
+     .map((input, i) => {
+       if (isTensor4D(input)) {
+         // if a tf.Tensor4D is passed in the input array, its batch size has to be 1
+         const batchSize = input.shape[0]
+         if (batchSize !== 1) {
+           throw new Error(`toNetInput -${getIdxHint(i)} tf.Tensor4D with batchSize ${batchSize} passed, but not supported in input array`)
+         }
+         // to tf.Tensor3D
+         return input.reshape(input.shape.slice(1))
+       }
+       return input
+     })
+
+   inputArray.forEach((input, i) => {
+     if (!isMediaElement(input) && !isTensor3D(input)) {
+       if (typeof inputArgArray[i] === 'string') {
+         throw new Error(`toNetInput -${getIdxHint(i)} string passed, but could not resolve HTMLElement for element id ${inputArgArray[i]}`)
+       }
+       throw new Error(`toNetInput -${getIdxHint(i)} expected media to be of type HTMLImageElement | HTMLVideoElement | HTMLCanvasElement | tf.Tensor3D, or to be an element id`)
+     }
+   })

    // wait for all media elements to be loaded
    await Promise.all(
-     medias.map(media => awaitMediaLoaded(media))
+     inputArray.map(input => isMediaElement(input) && awaitMediaLoaded(input))
    )

-   return new NetInput(medias)
+   return afterCreate(new NetInput(inputArray))
  }
\ No newline at end of file
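
A sketch of the input shapes toNetInput accepts after this change (the tensor shapes and element id are arbitrary examples consistent with the checks above):

```ts
import * as tf from '@tensorflow/tfjs-core';
import { toNetInput } from './toNetInput';

async function toNetInputExample(img: HTMLImageElement) {
  await toNetInput(img);              // single media element
  await toNetInput(['myImgId', img]); // mixed array, ids resolved via resolveInput

  // a tf.Tensor4D on its own is unstacked into a whole batch
  await toNetInput(tf.zeros([2, 100, 100, 3]) as tf.Tensor4D);

  // inside an array, a Tensor4D must have batchSize 1; Tensor3Ds are cloned
  await toNetInput([tf.zeros([100, 100, 3]) as tf.Tensor3D, img]);

  // manageCreatedInput = true marks the NetInput as managed, so its
  // tensors are freed automatically once a batch tensor has been created
  const managed = await toNetInput(img, true);
  managed.toBatchTensor(512).dispose();
}
```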
+ import * as tf from '@tensorflow/tfjs-core';
+
+ import { NetInput } from './NetInput';

  export type TMediaElement = HTMLImageElement | HTMLVideoElement | HTMLCanvasElement

- export type TNetInputArg = string | TMediaElement
+ export type TResolvedNetInput = TMediaElement | tf.Tensor3D | tf.Tensor4D
+
+ export type TNetInputArg = string | TResolvedNetInput

- export type TNetInput = TNetInputArg | Array<TNetInputArg>
+ export type TNetInput = TNetInputArg | Array<TNetInputArg> | NetInput | tf.Tensor4D

  export type Dimensions = {
    width: number
......
  import * as tf from '@tensorflow/tfjs-core';

+ import { isTensor4D } from './commons/isTensor';
  import { Dimensions } from './types';

  export function isFloat(num: number) {
......@@ -14,7 +15,7 @@ export function round(num: number) {
    return Math.floor(num * 100) / 100
  }

- export function getElement(arg: string | any) {
+ export function resolveInput(arg: string | any) {
    if (typeof arg === 'string') {
      return document.getElementById(arg)
    }
......@@ -106,12 +107,12 @@ export function bufferToImage(buf: Blob): Promise<HTMLImageElement> {
  }

  export async function imageTensorToCanvas(
-   imgTensor: tf.Tensor4D,
+   imgTensor: tf.Tensor,
    canvas?: HTMLCanvasElement
  ): Promise<HTMLCanvasElement> {
    const targetCanvas = canvas || document.createElement('canvas')

-   const [_, height, width, numChannels] = imgTensor.shape
+   const [height, width, numChannels] = imgTensor.shape.slice(isTensor4D(imgTensor) ? 1 : 0)
    await tf.toPixels(imgTensor.as3D(height, width, numChannels).toInt(), targetCanvas)

    return targetCanvas
......