kunlun/export/inc/cnn/cnn_tensor_util.h

/****************************************************************************

Copyright(c) 2019 by Aerospace C.Power (Chongqing) Microelectronics. ALL RIGHTS RESERVED.

This Information is proprietary to Aerospace C.Power (Chongqing) Microelectronics and MAY NOT
be copied by any method or incorporated into another program without
the express written consent of Aerospace C.Power. This Information or any portion
thereof remains the property of Aerospace C.Power. The Information contained herein
is believed to be accurate and Aerospace C.Power assumes no responsibility or
liability for its use in any way and conveys no license or title under
any patent or copyright and makes no representation or warranty that this
Information is free from patent or copyright infringement.

****************************************************************************/
#ifndef CNN_TENSOR_UTIL_H
#define CNN_TENSOR_UTIL_H

/* os shim includes */
#include "os_types_api.h"

#ifdef __cplusplus
extern "C" {
#endif

/** \defgroup cnn tensor util APIs
  * @brief cnn tensor util APIs
  * These functions could be used for tensor calculation, generally, CNN engine will not be used.
  * Arranged for cnn engine means every column will occupy a size which is a multiple of 32 bytes. Here column means the last dimension of a tensor
  * For example, if a tensor of type int8_t named A is of size 3 * 4 * 30 * 300, it will be saved in the memory of this form so that every column occupies 320 bytes:
  * A[0][0][0][0], A[0][0][0][1], ..., A[0][0][0][299], 0, 0, 0, ..., 0, A[0][0][1][0], A[0][0][1][1], ..., A[0][0][1][299], 0, 0, 0, 0, ..., 0, ..., A[0][0][29][0], ..., A[0][0][29][299], 0, ..., 0, A[0][1][0][0], ...
  *
  *
  *
  */

/** @addtogroup cnn_tensor_util_APIs
  * @{
  */

/* @brief cnn_arrange_weight_for_engine_8bit() - arrange weight for cnn engine to calculate a 1-group conv
 * @param origin: where the weight, a 4-D tensor, is put
 * @param destin: where to put the arranged tensor, and bias, out_shift_number and bias_shift_number will also be merged into this tensor for cnn engine
 * @param bias: where the bias for conv is put, if bias is not needed, please set this pointer to NULL
 * @param out_shift_number: where the shift numbers for output of acc are put, if this is not needed(i.e. all the output of acc will shift with the same number), please set this pointer to NULL, this will not be used unless bias_shift_number is not NULL
 * @param bias_shift_number: where the shift numbers for bias are put, if this is not needed(i.e. all the bias will shift with the same number and be added to acc), please set this pointer to NULL, this will not be used unless out_shift_number is not NULL
 * @param out_channel_per_weave: should be equal to the number of rows of acc used in conv
 * @param filter_size: the size of filter
 * @param in_channel: the number of channel of input
 * @param out_channel: the number of channel of output
 * @param h_w_cin_cout: non-zero - the origin weight is arranged as h * w * c_out * c_in, zero - the origin weight is arranged as c_out * c_in * h * w
 * @return 0 -- succeed
 * @return 1 -- filter_size is 0
 * @return 2 -- in_channel is 0
 * @return 3 -- out_channel is 0
 * @return 4 -- out_channel_per_weave is 0
 */
uint8_t cnn_arrange_weight_for_engine_8bit(int8_t *origin, int8_t *destin,
    int8_t *bias, uint8_t *out_shift_number, uint8_t *bias_shift_number,
    uint8_t out_channel_per_weave, int8_t filter_size, uint16_t in_channel,
    uint16_t out_channel, int8_t h_w_cin_cout);

/* @brief cnn_arrange_weight_for_engine_16bit() - arrange weight for cnn engine to calculate a 1-group conv
 * @param origin: where the weight, a 4-D tensor, is put
 * @param destin: where to put the arranged tensor, and bias, out_shift_number and bias_shift_number will also be merged into this tensor for cnn engine
 * @param bias: where the bias for conv is put, if bias is not needed, please set this pointer to NULL
 * @param out_shift_number: where the shift numbers for output of acc are put, if this is not needed(i.e. all the output of acc will shift with the same number), please set this pointer to NULL, this will not be used unless bias_shift_number is not NULL
 * @param bias_shift_number: where the shift numbers for bias are put, if this is not needed(i.e. all the bias will shift with the same number and be added to acc), please set this pointer to NULL, this will not be used unless out_shift_number is not NULL
 * @param out_channel_per_weave: should be equal to the number of rows of acc used in conv
 * @param filter_size: the size of filter
 * @param in_channel: the number of channel of input
 * @param out_channel: the number of channel of output
 * @param h_w_cin_cout: non-zero - the origin weight is arranged as h * w * c_out * c_in, zero - the origin weight is arranged as c_out * c_in * h * w
 * @return 0 -- succeed
 * @return 1 -- filter_size is 0
 * @return 2 -- in_channel is 0
 * @return 3 -- out_channel is 0
 * @return 4 -- out_channel_per_weave is 0
 */
uint8_t cnn_arrange_weight_for_engine_16bit(int16_t *origin, int16_t *destin,
    int16_t *bias, uint8_t *out_shift_number, uint8_t *bias_shift_number,
    uint8_t out_channel_per_weave, int8_t filter_size, uint16_t in_channel,
    uint16_t out_channel, int8_t h_w_cin_cout);

/* @brief cnn_arrange_weight_depthwise_for_engine_8bit() - arrange weight for cnn engine to calculate a 1-group conv
 * @param origin: where the weight, a 4-D tensor, is put
 * @param destin: where to put the arranged tensor, and bias, out_shift_number and bias_shift_number will also be merged into this tensor for cnn engine
 * @param bias: where the bias for conv is put, if bias is not needed, please set this pointer to NULL
 * @param out_shift_number: where the shift numbers for output of acc are put, if this is not needed(i.e. all the output of acc will shift with the same number), please set this pointer to NULL, this will not be used unless bias_shift_number is not NULL
 * @param bias_shift_number: where the shift numbers for bias are put, if this is not needed(i.e. all the bias will shift with the same number and be added to acc), please set this pointer to NULL, this will not be used unless out_shift_number is not NULL
 * @param filter_size: the size of filter
 * @param channel: the number of channel
 * @param h_w_cin_cout: non-zero - the origin weight is arranged as h * w * c_out * c_in, zero - the origin weight is arranged as c_out * c_in * h * w
 * @return 0 -- succeed
 * @return 1 -- filter_size is 0
 * @return 2 -- channel is 0
 */
uint8_t cnn_arrange_weight_depthwise_for_engine_8bit(int8_t *origin,
    int8_t *destin, int8_t *bias, uint8_t *out_shift_number,
    uint8_t *bias_shift_number, int8_t filter_size, uint16_t channel,
    int8_t h_w_cin_cout);

/* @brief cnn_arrange_weight_depthwise_for_engine_16bit() - arrange weight for cnn engine to calculate a 1-group conv
 * @param origin: where the weight, a 4-D tensor, is put
 * @param destin: where to put the arranged tensor, and bias, out_shift_number and bias_shift_number will also be merged into this tensor for cnn engine
 * @param bias: where the bias for conv is put, if bias is not needed, please set this pointer to NULL
 * @param out_shift_number: where the shift numbers for output of acc are put, if this is not needed(i.e. all the output of acc will shift with the same number), please set this pointer to NULL, this will not be used unless bias_shift_number is not NULL
 * @param bias_shift_number: where the shift numbers for bias are put, if this is not needed(i.e. all the bias will shift with the same number and be added to acc), please set this pointer to NULL, this will not be used unless out_shift_number is not NULL
 * @param filter_size: the size of filter
 * @param channel: the number of channel
 * @param h_w_cin_cout: non-zero - the origin weight is arranged as h * w * c_out * c_in, zero - the origin weight is arranged as c_out * c_in * h * w
 * @return 0 -- succeed
 * @return 1 -- filter_size is 0
 * @return 2 -- channel is 0
 */
uint8_t cnn_arrange_weight_depthwise_for_engine_16bit(int16_t *origin,
    int16_t *destin, int16_t *bias, uint8_t *out_shift_number,
    uint8_t *bias_shift_number, int8_t filter_size, uint16_t channel,
    int8_t h_w_cin_cout);

/* @brief cnn_arrange_bias_for_engine_8bits() - arrange bias for cnn engine to calculate a bias add oepration
 * @param origin: where the bias, a vector, is put
 * @param destin: where to put the arranged vector
 * @param length: the length of the vector
 * @return 0 -- succeed
 * @return 1 -- length is 0
 */
uint8_t cnn_arrange_bias_for_engine_8bits(int8_t *origin, int8_t *destin,
    uint32_t length);

/* @brief cnn_arrange_bias_for_engine_8bits() - arrange bias for cnn engine to calculate a bias add oepration
 * @param origin: where the bias, a vector, is put
 * @param destin: where to put the arranged vector
 * @param length: the length of the vector
 * @return 0 -- succeed
 * @return 1 -- length is 0
 */
uint8_t cnn_arrange_bias_for_engine_16bits(int16_t *origin, int16_t *destin,
    uint32_t length);

/* @brief cnn_get_shift_number_by_frac() - calculate the shift number of the output of acc and the shift number of bias before bias is added to acc, for cnn engine to calculate conv
 * @param frac_bias: where frac of bias of each channel is put
 * @param frac_weight: where the frac of weight of each output channel is put
 * @param frac_in: the frac of input of a conv calculation
 * @param frac_out: the frac of output of a conv calculation
 * @param out_shift_number: where to the shift number of the output of acc
 * @param bias_shift_number: where to the shift number of bias
 * @param out_channel: the channel of output, this should be equal to the length of frac_bias, frac_weight, out_shift_number, bias_shift_number
 * @return 0 -- succeed
 * @return 1 -- out_channel is 0
 */
uint8_t cnn_get_shift_number_by_frac(int8_t *frac_bias, int8_t *frac_weight,
    int8_t frac_in, int8_t frac_out, int8_t *out_shift_number,
    int8_t *bias_shift_number, uint32_t out_channel);

/* @brief cnn_align_feature_8bits() - re-arrange the successively arranged feature to the form for cnn engine
 * @param origin: where the origin feature is put
 * @param destin: where to put re-arranged feature
 * @param channel: the channel of this feature
 * @param height: the height of this feature
 * @param width: the width of this feature
 * @return 0 -- succeed
 * @return 1 -- channel is 0
 * @return 2 -- height is 0
 * @return 3 -- width is 0
 */
uint8_t cnn_align_feature_8bits(int8_t *origin, int8_t *destin,
    uint32_t channel, uint32_t height, uint32_t width);

/* @brief cnn_align_feature_16bits() - re-arrange the successively arranged feature to the form for cnn engine
 * @param origin: where the origin feature is put
 * @param destin: where to put re-arranged feature
 * @param channel: the channel of this feature
 * @param height: the height of this feature
 * @param width: the width of this feature
 * @return 0 -- succeed
 * @return 1 -- channel is 0
 * @return 2 -- height is 0
 * @return 3 -- width is 0
 */
uint8_t cnn_align_feature_16bits(int16_t *origin, int16_t *destin,
    uint32_t channel, uint32_t height, uint32_t width);

/* @brief cnn_unalign_feature_8bits() - re-arrange the feature from the form for cnn engine to successively arranged
 * @param origin: where the origin feature is put
 * @param destin: where to put re-arranged feature
 * @param channel: the channel of this feature
 * @param height: the height of this feature
 * @param width: the width of this feature
 * @return 0 -- succeed
 * @return 1 -- channel is 0
 * @return 2 -- height is 0
 * @return 3 -- width is 0
 */
uint8_t cnn_unalign_feature_8bits(int8_t *origin, int8_t *destin,
    uint32_t channel, uint32_t height, uint32_t width);

/* @brief cnn_unalign_feature_16bits() - re-arrange the feature from the form for cnn engine to successively arranged
 * @param origin: where the origin feature is put
 * @param destin: where to put re-arranged feature
 * @param channel: the channel of this feature
 * @param height: the height of this feature
 * @param width: the width of this feature
 * @return 0 -- succeed
 * @return 1 -- channel is 0
 * @return 2 -- height is 0
 * @return 3 -- width is 0
 */
uint8_t cnn_unalign_feature_16bits(int16_t *origin, int16_t *destin,
    uint32_t channel, uint32_t height, uint32_t width);

/* @brief cnn_tensor_add() - add 2 tensor using cnn engine
 * @param tensor1_addr: where the first tensor is put, the addr should be an integer multiple of 256
 * @param tensor2_addr: where the second tensor is put, the addr should be an integer multiple of 256
 * @param out_addr: where the output is put, the addr should be an integer multiple of 256
 * @param rank: the rank of the tensor
 * @param dim: the dimensions of the tensor
 * @param signed_: 0 - the tensors are unsigned, others - the tensors are signed
 * @param tensor_8bits: 0 - the tensors are encoded to int16 or uint16, others - the tensors are encoded to int8 or uint8
 * @param right_shift: the right shift number of the sum, this is used for avoiding overflow
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @param use_cnn: if you want to use CNN engine to complete the calculation
 * @return 0 -- succeed and CNN engine is used in the calculation
 * @return 1 -- rank < 2
 * @return 2 -- input along one common dimension is 0
 * @return 255 -- succeed and CNN engine is not used in the calculation
 */
uint8_t cnn_tensor_add(uint32_t tensor1_addr, uint32_t tensor2_addr,
    uint32_t out_addr, uint32_t rank, uint16_t *dim, uint8_t signed_,
    uint8_t tensor_8bits, uint8_t right_shift, uint8_t in_for_cnn,
    uint8_t out_for_cnn, uint8_t use_cnn);

/* @brief cnn_tensor_sub() - calculate tensor1 minus tensor2
 * @param tensor1_addr: where the first tensor is put, the addr should be an integer multiple of 256
 * @param tensor2_addr: where the second tensor is put, the addr should be an integer multiple of 256
 * @param out_addr: where the output is put, the addr should be an integer multiple of 256
 * @param rank: the rank of the tensor
 * @param dim: the dimensions of the tensor
 * @param signed_: 0 - the tensors are unsigned, others - the tensors are signed, no matter if the signed_ is 0, the output will be signed number
 * @param tensor_8bits: 0 - the tensors are encoded to int16 or uint16, others - the tensors are encoded to int8 or uint8
 * @param right_shift: the right shift number of the sum, this is used for avoiding overflow
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @param use_cnn: if you want to use CNN engine to complete the calculation
 * @return 0 -- succeed and CNN engine is used in the calculation
 * @return 1 -- rank < 2
 * @return 2 -- input along one common dimension is 0
 * @return 255 -- succeed and CNN engine is not used in the calculation
 */
uint8_t cnn_tensor_sub(uint32_t tensor1_addr, uint32_t tensor2_addr,
    uint32_t out_addr, uint32_t rank, uint16_t *dim, uint8_t signed_,
    uint8_t tensor_8bits, uint8_t right_shift, uint8_t in_for_cnn,
    uint8_t out_for_cnn, uint8_t use_cnn);

/* @brief cnn_tensor_mul_8bit() - mutiply 2 tensors elementwisely, if an element of the result is out of range of -128~127, it will be set to -128 or 127
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param right_shift: the right shift number when set the result to the output, it could be used for avoid saturation
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_mul_8bit(int8_t *input1, int8_t *input2, uint8_t rank,
    uint16_t *dim, int8_t *output, uint8_t right_shift,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_mul_16bit() - mutiply 2 tensors elementwisely, if an element of the result is out of range of -65536~65535, it will be set to -65536 or 65535
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param right_shift: the right shift number when set the result to the output, it could be used for avoid saturation
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_mul_16bit(int16_t *input1, int16_t *input2, uint8_t rank,
    uint16_t *dim, int16_t *output, uint8_t right_shift,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_div_8bit() - divide the 1st tensor by the 2nd tensor elementwisely, if an element of the result is out of range of -128~127, it will be set to -128 or 127
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param left_shift: the left shift number of 1st tensor when calculating division, it could be used for a higher precision
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_div_8bit(int8_t *input1, int8_t *input2, uint8_t rank,
    uint16_t *dim, int8_t *output, uint8_t left_shift,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_div_16bit() - divide the 1st tensor by the 2nd tensor elementwisely, if an element of the result is out of range of -65536~65535, it will be set to -65536 or 65535
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param left_shift: the left shift number of 1st tensor when calculating division, it could be used for a higher precision
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_div_16bit(int16_t *input1, int16_t *input2, uint8_t rank,
    uint16_t *dim, int16_t *output, uint8_t left_shift,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_mod_8bit() - mode the 1st tensor by the 2nd tensor elementwisely
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_mod_8bit(int8_t *input1, int8_t *input2, uint8_t rank,
    uint16_t *dim, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_mod_16bit() - mode the 1st tensor by the 2nd tensor elementwisely
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_mod_16bit(int16_t *input1, int16_t *input2, uint8_t rank,
    uint16_t *dim, int16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_split_8bit() - split a tensor to tensors, along one dimension
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the tensor
 * @param num_split: the number to split, for example, if it is 2, the input tensor will be split into 2 tensors, need to evenly divide the dimension along which the input is split
 * @param out: where to put the split result
 * @param split_dim: the dimension along which to split the input tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank is 0
 * @return 2 -- split_dim is out of the range of [0, rank)
 * @return 3 -- num_split < 2
 * @return 4 -- the output's dimension along which to split the input tensor is 0
 * @return 5 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_split_8bit(int8_t *input, uint8_t rank, uint16_t *dim,
    uint8_t num_split, int8_t **out, uint8_t split_dim,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_split_16bit() - split a tensor to tensors, along one dimension
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the tensor
 * @param num_split: the number to split, for example, if it is 2, the input tensor will be split into 2 tensors, need to evenly divide the dimension along which the input is split
 * @param out: where to put the split result
 * @param split_dim: the dimension along which to split the input tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank is 0
 * @return 2 -- split_dim is out of the range of [0, rank)
 * @return 3 -- num_split < 2
 * @return 4 -- the output's dimension along which to split the input tensor is 0
 * @return 5 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_split_16bit(int16_t *input, uint8_t rank, uint16_t *dim,
    uint8_t num_split, int16_t **out, uint8_t split_dim,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_slice_8bit() - extract a slice from a tensor
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param begin: the start location of extraction
 * @param size: the size of the extraction on every dimension
 * @param output: where to put the output
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- extraction size along one dimension is 0
 * @return 3 -- input along one dimension is 0
 * @return 4 -- along one dimension, begin index plus size is larger than input(i.e. extracted element index is out of the range of the input)
 */
uint8_t cnn_tensor_slice_8bit(int8_t *input, uint8_t rank, uint16_t *dim,
    uint16_t *begin, uint16_t *size, int8_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_slice_8bit() - extract a slice from a tensor
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param begin: the start location of extraction
 * @param size: the size of the extraction on every dimension
 * @param output: where to put the output
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- extraction size along one dimension is 0
 * @return 3 -- input along one dimension is 0
 * @return 4 -- along one dimension, begin index plus size is larger than input(i.e. extracted element index is out of the range of the input)
 */
uint8_t cnn_tensor_slice_16bit(int16_t *input, uint8_t rank, uint16_t *dim,
    uint16_t *begin, uint16_t *size, int16_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_tile_8bit() - construct a tensor by tiling a given tensor
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param multi: the multiplier of tiling the given tensor
 * @param output: where to put the tiled tensor
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- the multiplier along one dimension is 0
 * @return 3 -- input along one dimension is 0
 */
uint8_t cnn_tensor_tile_8bit(int8_t *input, uint8_t rank, uint16_t *dim,
    uint16_t *multi, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_tile_16bit() - construct a tensor by tiling a given tensor
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param multi: the multiplier of tiling the given tensor
 * @param output: where to put the tiled tensor
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- the multiplier along one dimension is 0
 * @return 3 -- input along one dimension is 0
 */
uint8_t cnn_tensor_tile_16bit(int16_t *input, uint8_t rank, uint16_t *dim,
    uint16_t *multi, int16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_pad_8bit() - pad a tensor with zeros
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param pad: the number of zeros added before and after the contents of input, should be arranged in this form : [b_0, a_0, b_1, a_1, ..., b_rank-1, a_rank-1], where b_n is the number of zeros added before the contents of input along n-th dimension, a_n is the number of zeros added after the contents of input along n-th dimension
 * @param output: where to put the tiled tensor
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- input along one dimension is 0
 */
uint8_t cnn_tensor_pad_8bit(int8_t *input, uint8_t rank, uint16_t *dim,
    uint16_t *pad, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_pad_16bit() - pad a tensor with zeros
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param pad: the number of zeros added before and after the contents of input, should be arranged in this form : [b_0, a_0, b_1, a_1, ..., b_rank-1, a_rank-1], where b_n is the number of zeros added before the contents of input along n-th dimension, a_n is the number of zeros added after the contents of input along n-th dimension
 * @param output: where to put the tiled tensor
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- input along one dimension is 0
 */
uint8_t cnn_tensor_pad_16bit(int16_t *input, uint8_t rank, uint16_t *dim,
    uint16_t *pad, int16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_concat_8bit() - concatenates tensors along one dimension
 * @param input: where the input tensors are put
 * @param rank: the rank of the input tensors
 * @param common_dim: the other dimensions of the input tensors, the size of the dimension along which to concatenate should not be here
 * @param number_input: the number of input tensors
 * @param size_along_cat_dim: the sizes of input tensors on the dimension along which to concatenate
 * @param dim_cat: dimension along which to concatenate, for example, to concatenate along the first dimension, this should to 0
 * @param output: where to put the concatenated tensor
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- input along one common dimension is 0
 * @return 3 -- number_input < 2
 * @return 4 -- one of the sizes on the dimension along which to concatenate is 0
 * @return 5 -- dim_cat >= rank
 */
uint8_t cnn_tensor_concat_8bit(int8_t **input, uint8_t rank,
    uint16_t *common_dim, uint16_t number_input, uint16_t *size_along_cat_dim,
    uint8_t dim_cat, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_concat_16bit() - concatenates tensors along one dimension
 * @param input: where the input tensors are put
 * @param rank: the rank of the input tensors
 * @param common_dim: the other dimensions of the input tensors, the size of the dimension along which to concatenate should not be here
 * @param number_input: the number of input tensors
 * @param size_along_cat_dim: the sizes of input tensors on the dimension along which to concatenate
 * @param dim_cat: dimension along which to concatenate, for example, to concatenate along the first dimension, this should to 0
 * @param output: where to put the concatenated tensor
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- input along one common dimension is 0
 * @return 3 -- number_input < 2
 * @return 4 -- one of the sizes on the dimension along which to concatenate is 0
 * @return 5 -- dim_cat >= rank
 */
uint8_t cnn_tensor_concat_16bit(int16_t **input, uint8_t rank,
    uint16_t *common_dim, uint16_t number_input, uint16_t *size_along_cat_dim,
    uint8_t dim_cat, int16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_transpose_8bit() - transpose a tensor
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param perm: the indice of dimension according which to transpose the tensor, for example, given a tensor of size a[0] * a[1] * a[2] * ... * a[n], given perm =;p[0], p[1], p[2], ..., p[n]}, the output tensor is of size a[p[0]] * a[p[1]] * a[p[2]] * ... * a[p[n]]
 * @param output: where the output will be put
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- input along one common dimension is 0
 * @return 3 -- one of the numbers in perm is equal or larger than rank
 * @return 4 -- number repetition in perm
 */
uint32_t cnn_tensor_transpose_8bit(int8_t *input, uint8_t rank, uint16_t *dim,
    uint8_t *perm, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_transpose_8bit() - transpose a tensor
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param perm: the indice of dimension according which to transpose the tensor, for example, given a tensor of size a[0] * a[1] * a[2] * ... * a[n], given perm =;p[0], p[1], p[2], ..., p[n]}, the output tensor is of size a[p[0]] * a[p[1]] * a[p[2]] * ... * a[p[n]]
 * @param output: where the output will be put
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- input along one common dimension is 0
 * @return 3 -- one of the numbers in perm is equal or larger than rank
 * @return 4 -- number repetition in perm
 */
uint32_t cnn_tensor_transpose_16bit(int16_t *input, uint8_t rank, uint16_t *dim,
    uint8_t *perm, int16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_channel_shuffle() - divide feature along channels into groups and shuffle the channels, ref:https://arxiv.org/pdf/1707.01083.pdf
 * @param input_addr: where the feature is put
 * @param channel: number of channels of the feature
 * @param height: height of the feature
 * @param width: width of the feature
 * @param group: how many groups the feature will be divided into
 * @param output_addr: where the shuffled feature will be put
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- channel is 0
 * @return 2 -- height is 0
 * @return 3 -- width is 0
 * @return 4 -- group is 0
 * @return 5 -- channel is smaller than group
 * @return 6 -- channel % group != 0
 */
uint8_t cnn_channel_shuffle(uint32_t input_addr, uint16_t channel,
    uint16_t height, uint16_t width, uint16_t group, uint32_t output_addr,
    uint8_t in_for_cnn, uint8_t out_for_cnn, uint32_t tensor_8bits);

/* @brief cnn_tensor_reverse_8bit() - reverse specific dimensions of a tensor
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param reverse: to mark the dimensions to reverse, elements along i-th dimension will be reversed if and only if reverse[i] is non-zero, for example, to reverse a 6-D tensor along dimension 0, 3, 4, reverse should be;1, 0, 0, 1, 1, 0}
 * @param output_addr: where the shuffled feature will be put
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed and CNN engine is used in the calculation
 * @return 1 -- rank < 2
 * @return 2 -- input along one common dimension is 0
 * @return 255 -- succeed and CNN engine is not used in the calculation
 */
uint8_t cnn_tensor_reverse_8bit(int8_t *input, uint8_t rank, uint16_t *dim,
    uint8_t *reverse, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_reverse_16bit() - reverse specific dimensions of a tensor
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param reverse: to mark the dimensions to reverse, elements along i-th dimension will be reversed if and only if reverse[i] is non-zero, for example, to reverse a 6-D tensor along dimension 0, 3, 4, reverse should be;1, 0, 0, 1, 1, 0}
 * @param output_addr: where the shuffled feature will be put
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- input along one common dimension is 0
 */
uint8_t cnn_tensor_reverse_16bit(int16_t *input, uint8_t rank, uint16_t *dim,
    uint8_t *reverse, int16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_channel_reverse_16bit() - reverse specific dimensions of a tensor
 * @param int8_bit: 0: the tensor's type is int16, 1: the tensor's type is int8
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param addr: where the shuffled feature will be put
 * @param for_cnn: non-zero - tensor is arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- input along one common dimension is 0
 */
uint8_t cnn_zero_tensor(uint8_t int8_bit, uint8_t rank, uint16_t *dim,
    uint32_t addr, uint8_t for_cnn);

/* @brief cnn_channel_reverse_16bit() - reverse specific dimensions of a tensor
 * @param int8_bit: 0: the tensor's type is int16, 1: the tensor's type is int8
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param addr: where the shuffled feature will be put
 * @param for_cnn: non-zero - tensor is arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- input along one common dimension is 0
 */
uint8_t cnn_ones_tensor(uint8_t int8_bit, uint8_t rank, uint16_t *dim,
    uint32_t addr, uint8_t for_cnn);

/* @brief cnn_channel_reverse_16bit() - reverse specific dimensions of a tensor
 * @param int8_bit: 0: the tensor's type is int16, 1: the tensor's type is int8
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param addr: where the shuffled feature will be put
 * @param for_cnn: non-zero - tensor is arranged for cnn engine
 * @param to_fill: the number to be filled in the tensor, if the tensor's type is int8, the lower 8 bit of to_fill will be used
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- input along one common dimension is 0
 */
uint8_t cnn_fill_tensor(uint8_t int8_bit, uint8_t rank, uint16_t *dim,
    uint32_t addr, uint8_t for_cnn, int16_t to_fill);

/* @brief cnn_tensor_pack_8bit() - pack a list of rank-R tensors into one rank-(R+1) tensor.
 * @param input: where the input tensors are put
 * @param rank: the rank of the input tensors
 * @param dim: the other dimensions of the input tensors, the size of the dimension along which to concatenate should not be here
 * @param number_input: the number of input tensors
 * @param output: where to put the concatenated tensor
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank = 0
 * @return 2 -- input along one common dimension is 0
 * @return 3 -- number_input < 2
 */
uint8_t cnn_tensor_pack_8bit(int8_t **input, uint8_t rank, uint16_t *dim,
    uint16_t number_input, int8_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_pack_16bit() - pack a list of rank-R tensors into one rank-(R+1) tensor.
 * @param input: where the input tensors are put
 * @param rank: the rank of the input tensors
 * @param dim: the other dimensions of the input tensors, the size of the dimension along which to concatenate should not be here
 * @param number_input: the number of input tensors
 * @param output: where to put the concatenated tensor
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank = 0
 * @return 2 -- input along one common dimension is 0
 * @return 3 -- number_input < 2
 */
uint8_t cnn_tensor_pack_16bit(int16_t **input, uint8_t rank, uint16_t *dim,
    uint16_t number_input, int16_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_unpack_8bit() - unpack the outer dimension of a rank-R tensor into rank-(R-1) tensors.
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the tensor
 * @param out: where to put the split result
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- one of the dimensions is 0
 * @return 3 -- size along first dimension is 1
 */
uint8_t cnn_tensor_unpack_8bit(int8_t *input, uint8_t rank, uint16_t *dim,
    int8_t **out, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_unpack_16bit() - unpack the outer dimension of a rank-R tensor into rank-(R-1) tensors.
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the tensor
 * @param out: where to put the split result
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- one of the dimensions is 0
 * @return 3 -- size along first dimension is 1
 */
uint8_t cnn_tensor_unpack_16bit(int16_t *input, uint8_t rank, uint16_t *dim,
    int16_t **out, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_reverse_sequence_8bit() - reverse variable length slices in given dimension. This op first slices input along the first dimension, and for each slice i, reverses the first reverse_length[i] elements along the dimension reverse_dim
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the tensor
 * @param output: where to put output tensor
 * @param reverse_length: the length to reverse
 * @param reverse_dim: the given dimension along which to reverse
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- one of the dimensions is 0
 * @return 3 -- reverse_dim is 0
 * @return 4 -- reverse_dim >= rank
 * @return 5 -- one of the reverse_length is larger than the size of the reversed dimension
 */
uint8_t cnn_tensor_reverse_sequence_8bit(int8_t *input, uint8_t rank,
    uint16_t *dim, int8_t *output, uint16_t *reverse_length,
    uint8_t reverse_dim, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_reverse_sequence_16bit() - reverse variable length slices in given dimension. This op first slices input along the first dimension, and for each slice i, reverses the first reverse_length[i] elements along the dimension reverse_dim
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the tensor
 * @param output: where to put output tensor
 * @param reverse_length: the length to reverse
 * @param reverse_dim: the given dimension along which to reverse
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- one of the dimensions is 0
 * @return 3 -- reverse_dim is 0
 * @return 4 -- reverse_dim >= rank
 * @return 5 -- one of the reverse_length is larger than the size of the reversed dimension
 */
uint8_t cnn_tensor_reverse_sequence_16bit(int16_t *input, uint8_t rank,
    uint16_t *dim, int16_t *output, uint16_t *reverse_length,
    uint8_t reverse_dim, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_gather_8bit() - gather slices from input according to the given indices, output[i,:,:,...:] = input[indice[i],:,:,...:]
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the tensor
 * @param output: where to put output tensor
 * @param length_indice: the length of incide
 * @param indice: the given indices
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- one of the dimensions is 0
 * @return 3 -- length_indice is 0
 * @return 4 -- indice out of range of input tensor
 */
uint8_t cnn_tensor_gather_8bit(int8_t *input, uint8_t rank, uint16_t *dim,
    int8_t *output, uint16_t length_indice, uint16_t *indice,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_gather_16bit() - gather slices from input according to the given indices, output[i,:,:,...:] = input[indice[i],:,:,...:]
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the tensor
 * @param output: where to put output tensor
 * @param length_indice: the length of incide
 * @param indice: the given indices
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank is 0
 * @return 2 -- one of the dimensions is 0
 * @return 3 -- length_indice is 0
 * @return 4 -- indice out of range of input tensor
 */
uint8_t cnn_tensor_gather_16bit(int16_t *input, uint8_t rank, uint16_t *dim,
    int16_t *output, uint16_t length_indice, uint16_t *indice,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_dynamic_partition_8bit() - partition input tensor into tensors using given indices
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the tensor
 * @param output: where to put output tensors
 * @param num_partition: the number of output tensors
 * @param length_partitions: the length of given indices
 * @param partitions: the given indices, a vector
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- one of the dimensions is 0
 * @return 3 -- num_partitioin is 0
 * @return 4 -- length of given indices out of range of input tensor
 */
uint8_t cnn_tensor_dynamic_partition_8bit(int8_t *input, uint8_t rank,
    uint16_t *dim, int8_t **output, uint16_t num_partition,
    uint16_t length_partitions, uint16_t *partitions,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_dynamic_partition_16bit() - partition input tensor into tensors using given indices
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the tensor
 * @param output: where to put output tensors
 * @param num_partition: the number of output tensors
 * @param length_partitions: the length of given indices
 * @param partitions: the given indices, a vector
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- one of the dimensions is 0
 * @return 3 -- num_partitioin is 0
 * @return 4 -- length of given indices out of range of input tensor
 */
uint8_t cnn_tensor_dynamic_partition_16bit(int16_t *input, uint8_t rank,
    uint16_t *dim, int16_t **output, uint16_t num_partition,
    uint16_t length_partitions, uint16_t *partitions,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_dynamic_stitch_8bit() - extract the slices from the input tensors into a single tensor, according to the given indice
 * @param input: the input tensors
 * @param num_input: the number of input tensors
 * @param rank: the rank of the input tensors
 * @param dim_from_2nd: the dimensions of the input tensors, the sizes along 1st dimension are not included
 * @param first_dim: the sizes along 1st dimension of the input tensors
 * @param output: where to put output tensor
 * @param num_indice: the number of indice of each input tensor
 * @param indice: the given indice for interleave the input tensors
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- one of the dimensions is 0
 * @return 3 -- one element of num_indice is larger than the size along 1st dim of the corresponding input tensor
 * @return 4 -- one index is larger than the total number of slices
 * @return 5 -- not all the indice in [0, total number of slices) are set a specific slice
 */
uint8_t cnn_tensor_dynamic_stitch_8bit(int8_t **input, uint16_t num_input,
    uint8_t rank, uint16_t *dim_from_2nd, uint16_t *first_dim,
    int8_t *output, uint16_t *num_indice, uint16_t **indice,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_dynamic_stitch_16bit() - extract the slices from the input tensors into a single tensor, according to the given indice
 * @param input: the input tensors
 * @param num_input: the number of input tensors
 * @param rank: the rank of the input tensors
 * @param dim_from_2nd: the dimensions of the input tensors, the sizes along 1st dimension are not included
 * @param first_dim: the sizes along 1st dimension of the input tensors
 * @param output: where to put output tensor
 * @param num_indice: the number of indice of each input tensor
 * @param indice: the given indice for interleave the input tensors
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 2
 * @return 2 -- one of the dimensions is 0
 * @return 3 -- one element of num_indice is larger than the size along 1st dim of the corresponding input tensor
 * @return 4 -- one index is larger than the total number of slices
 * @return 5 -- not all the indice in [0, total number of slices) are set a specific slice
 */
uint8_t cnn_tensor_dynamic_stitch_16bit(int16_t **input, uint16_t num_input,
    uint8_t rank, uint16_t *dim_from_2nd, uint16_t *first_dim, int16_t *output,
    uint16_t *num_indice, uint16_t **indice,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_logic_and_8bit() - calculate the logical and of 2 tensors elementwisely
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_logic_and_8bit(int8_t *input1, int8_t *input2, uint8_t rank,
    uint16_t *dim, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_logic_and_16bit() - calculate the logical and of 2 tensors elementwisely
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_logic_and_16bit(int16_t *input1, int16_t *input2,
    uint8_t rank, uint16_t *dim, int16_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_logic_or_8bit() - calculate logical or of 2 tensors elementwisely
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_logic_or_8bit(int8_t *input1, int8_t *input2, uint8_t rank,
    uint16_t *dim, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_logic_or_16bit() - calculate logical or of 2 tensors elementwisely
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_logic_or_16bit(int16_t *input1, int16_t *input2,
    uint8_t rank, uint16_t *dim, int16_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_logic_xor_8bit() - calculate logical xor of 2 tensors elementwisely
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_logic_xor_8bit(int8_t *input1, int8_t *input2, uint8_t rank,
    uint16_t *dim, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_logic_xor_16bit() - calculate logical xor of 2 tensors elementwisely
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_logic_xor_16bit(int16_t *input1, int16_t *input2,
    uint8_t rank, uint16_t *dim, int16_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_logic_xnor_8bit() - calculate logical xnor of 2 tensors elementwisely
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_logic_xnor_8bit(int8_t *input1, int8_t *input2, uint8_t rank,
    uint16_t *dim, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_logic_xnor_16bit() - calculate logical xnor of 2 tensors elementwisely
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_logic_xnor_16bit(int16_t *input1, int16_t *input2,
    uint8_t rank, uint16_t *dim, int16_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_logic_not_8bit() - calculate logical not of given tensor elementwisely
 * @param input: the input tensor
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_logic_not_8bit(int8_t *input, uint8_t rank, uint16_t *dim,
    int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_logic_not_16bit() - calculate logical not of given tensor elementwisely
 * @param input: the input tensor
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_logic_not_16bit(int16_t *input, uint8_t rank, uint16_t *dim,
    int16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_equal_8bit() - return the truth value of (input1 == input2) element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_equal_8bit(int8_t *input1, int8_t *input2, uint8_t rank,
    uint16_t *dim, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_equal_16bit() - return the truth value of (input1 == input2) element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_equal_16bit(int16_t *input1, int16_t *input2, uint8_t rank,
    uint16_t *dim, int16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_not_equal_8bit() - return the truth value of (input1 != input2) element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_not_equal_8bit(int8_t *input1, int8_t *input2, uint8_t rank,
    uint16_t *dim, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_not_equal_16bit() - return the truth value of (input1 != input2) element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_not_equal_16bit(int16_t *input1, int16_t *input2,
    uint8_t rank, uint16_t *dim, int16_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_less_8bit() - return the truth value of (input1 < input2) element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_less_8bit(int8_t *input1, int8_t *input2, uint8_t rank,
    uint16_t *dim, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_less_16bit() - return the truth value of (input1 < input2) element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_less_16bit(int16_t *input1, int16_t *input2, uint8_t rank,
    uint16_t *dim, int16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_less_or_equal_8bit() - return the truth value of (input1 <= input2) element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_less_or_equal_8bit(int8_t *input1, int8_t *input2,
    uint8_t rank, uint16_t *dim, int8_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_less_or_equal_16bit() - return the truth value of (input1 <= input2) element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_less_or_equal_16bit(int16_t *input1, int16_t *input2,
    uint8_t rank, uint16_t *dim, int16_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_greater_8bit() - return the truth value of (input1 > input2) element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_greater_8bit(int8_t *input1, int8_t *input2, uint8_t rank,
    uint16_t *dim, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_greater_16bit() - return the truth value of (input1 > input2) element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_greater_16bit(int16_t *input1, int16_t *input2, uint8_t rank,
    uint16_t *dim, int16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_greater_or_equal_8bit() - return the truth value of (input1 >= input2) element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_greater_or_equal_8bit(int8_t *input1, int8_t *input2,
    uint8_t rank, uint16_t *dim, int8_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_greater_or_equal_16bit() - return the truth value of (input1 >= input2) element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_greater_or_equal_16bit(int16_t *input1, int16_t *input2,
    uint8_t rank, uint16_t *dim, int16_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_select_8bit() - select elements from 2 tensors depending on the condition tensor
 * @param selected_when_true: the tensor whose elements will be selected when the corresponding elements of condition tensor is non-zero
 * @param selected_when_false: the tensor whose elements will be selected when the corresponding elements of condition tensor is zero
 * @param condition: the condition tensor
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_select_8bit(int8_t *selected_when_true,
    int8_t *selected_when_false, int8_t *condition, uint8_t rank, uint16_t *dim,
    int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_select_16bit() - select elements from 2 tensors depending on the condition tensor
 * @param selected_when_true: the tensor whose elements will be selected when the corresponding elements of condition tensor is non-zero
 * @param selected_when_false: the tensor whose elements will be selected when the corresponding elements of condition tensor is zero
 * @param condition: the condition tensor
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_select_16bit(int16_t *selected_when_true,
    int16_t *selected_when_false, int16_t *condition, uint8_t rank,
    uint16_t *dim, int16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_where_8bit() - return locations of non-zero elements of the input tensor
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param output: where to put output tensor
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- no non-zero element
 * @return -1 -- rank < 1
 * @return -2 -- input along one common dimension is 0
 * @return other positive number -- the number of non-zero elements
 */
int32_t cnn_tensor_where_8bit(int8_t *input, uint8_t rank, uint16_t *dim,
    uint16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_where_16bit() - return locations of non-zero elements of the input tensor
 * @param input: the input tensor
 * @param rank: the rank of the input tensor
 * @param dim: the dimensions of the input tensor
 * @param output: where to put output tensor
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- no non-zero element
 * @return -1 -- rank < 1
 * @return -2 -- input along one common dimension is 0
 * @return other positive number -- the number of non-zero elements
 */
int32_t cnn_tensor_where_16bit(int16_t *input, uint8_t rank, uint16_t *dim,
    uint16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_add_n_8bit() - add tensors elementwise to one tensor
 * @param input: the input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param right_shift: the right shift number when set the result to the output, it could be used for avoid saturation
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 * @return 3 -- number_input < 2
 */
uint8_t cnn_tensor_add_n_8bit(int8_t **input, uint8_t rank, uint16_t *dim,
    uint16_t number_input, uint8_t right_shift, int8_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_add_n_16bit() - add tensors elementwisely to one tensor
 * @param input: the input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param right_shift: the right shift number when set the result to the output, it could be used for avoid saturation
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 * @return 3 -- number_input < 2
 */
uint8_t cnn_tensor_add_n_16bit(int16_t **input, uint8_t rank, uint16_t *dim,
    uint16_t number_input, uint8_t right_shift, int16_t *output,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_abs() - calculate the absolute value of a tensor elementwisely
 * @param input_addr: the address where the input tensor is put
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: the address where to put output tensor
 * @param tensor_8bits: 0 -- the tensor is of type int16, other -- the tensor is of type int8
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @param use_cnn: if you want to use CNN engine to complete the calculation
 * @return 0 -- succeed and CNN engine is used
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 * @return 255 -- succeed and CNN engine is not used
 */
uint8_t cnn_tensor_abs(uint32_t input_addr, uint8_t rank, uint16_t *dim,
    uint32_t output_addr, uint8_t tensor_8bits,
    uint8_t in_for_cnn, uint8_t out_for_cnn, uint8_t use_cnn);

/* @brief cnn_tensor_neg() - calculate numerical negative value of a tensor elementwisely
 * @param input_addr: the address where the input tensor is put
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: the address where to put output tensor
 * @param tensor_8bits: 0 -- the tensor is of type int16, other -- the tensor is of type int8
 * @param in_for_cnn: non-zero - input is arranged for cnn engine
 * @param out_for_cnn: non-zero - output will be arranged for cnn engine
 * @param use_cnn: if you want to use CNN engine to complete the calculation
 * @return 0 -- succeed and CNN engine is used
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 * @return 255 -- succeed and CNN engine is not used
 */
uint8_t cnn_tensor_neg(uint32_t input_addr, uint8_t rank, uint16_t *dim,
    uint32_t output_addr, uint8_t tensor_8bits,
    uint8_t in_for_cnn, uint8_t out_for_cnn, uint8_t use_cnn);

/* @brief cnn_tensor_sign_8bit() - calculate element-wise indication of the sign of a number
                                   y = sign(x) = -1 if x < 0; 0 if x == 0; 1 if x > 0
 * @param input: the input tensor
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_sign_8bit(int8_t *input, uint8_t rank, uint16_t *dim,
    int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_sign_16bit() - calculate element-wise indication of the sign of a number
                                    y = sign(x) = -1 if x < 0; 0 if x == 0; 1 if x > 0
 * @param input: the input tensor
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_sign_16bit(int16_t *input, uint8_t rank, uint16_t *dim,
    int16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_inv_8bit() - calculate the reciprocal of x element-wise.
 * @param input: the input tensor
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param left_shift: the left shift number of the result, it could be used for a higher precision
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_inv_8bit(int8_t *input, uint8_t rank, uint16_t *dim,
    int8_t *output, uint8_t left_shift,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_inv_16bit() - calculate the reciprocal of x element-wise.
 * @param input: the input tensor
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param left_shift: the left shift number of the result, it could be used for a higher precision
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_inv_16bit(int16_t *input, uint8_t rank, uint16_t *dim,
    int16_t *output, uint8_t left_shift,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_square_8bit() - calculate the square of x element-wise.
 * @param input: the input tensor
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param right_shift: the right shift number of the sum, this is used for avoiding overflow
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_square_8bit(int8_t *input, uint8_t rank, uint16_t *dim,
    int8_t *output, uint8_t right_shift,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_square_16bit() - calculate the square of x element-wise.
 * @param input: the input tensor
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param right_shift: the right shift number of the sum, this is used for avoiding overflow
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_square_16bit(int16_t *input, uint8_t rank, uint16_t *dim,
    int16_t *output, uint8_t right_shift,
    uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_max_8bit() - return the max value of input1 and input2 element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_max_8bit(int8_t *input1, int8_t *input2, uint8_t rank,
    uint16_t *dim, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_max_16bit() - return the max value of input1 and input2 element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_max_16bit(int16_t *input1, int16_t *input2, uint8_t rank,
    uint16_t *dim, int16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_min_8bit() - return the min value of input1 and input2 element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_min_8bit(int8_t *input1, int8_t *input2, uint8_t rank,
    uint16_t *dim, int8_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_min_16bit() - return the min value of input1 and input2 element-wise, output has the same type as the input.
 * @param input1: the 1st input tensors
 * @param input2: the 2nd input tensors
 * @param rank: the rank of the input tensors
 * @param dim: the dimensions of the input tensors
 * @param output: where to put output tensor
 * @param in_for_cnn: if the input tensor is arranged for cnn engine
 * @param out_for_cnn: if the output will be arranged for cnn engine
 * @return 0 -- succeed
 * @return 1 -- rank < 1
 * @return 2 -- one of the dimensions is 0
 */
uint8_t cnn_tensor_min_16bit(int16_t *input1, int16_t *input2, uint8_t rank,
    uint16_t *dim, int16_t *output, uint8_t in_for_cnn, uint8_t out_for_cnn);

/* @brief cnn_tensor_plus_minus128() - add 128 to a tensor whose elements are in range [-128, 127] or sub 128 to a tensor whose elements are in range [0, 255], this operation will be completed by CNN engine, therefore, the input tensor should be arranged for CNN engine.
 * @param input_addr: address of input tensor
 * @param output_addr: address of output tensor
 * @param channel: channel of input tensor
 * @param height: height of input tensor
 * @param width: width of input tensor
 * @return 0 -- succeed
 * @return 1 -- input_addr is not a multiple of 256
 * @return 2 -- output_addr is not a multiple of 256
 * @return 3 -- channel is 0
 * @return 4 -- height is 0
 * @return 5 -- width is 0
 */
uint8_t cnn_tensor_plus_minus128(uint32_t input_addr, uint32_t output_addr,
    uint32_t channel, uint32_t height, uint32_t width);
/**
  * @}
  */


#ifdef __cplusplus
}
#endif

#endif // !CNN_TENSOR_UTIL_