/****************************************************************************
Copyright(c) 2019 by Aerospace C.Power (Chongqing) Microelectronics. ALL RIGHTS RESERVED.
This Information is proprietary to Aerospace C.Power (Chongqing) Microelectronics and MAY NOT
be copied by any method or incorporated into another program without
the express written consent of Aerospace C.Power. This Information or any portion
thereof remains the property of Aerospace C.Power. The Information contained herein
is believed to be accurate and Aerospace C.Power assumes no responsibility or
liability for its use in any way and conveys no license or title under
any patent or copyright and makes no representation or warranty that this
Information is free from patent or copyright infringement.
****************************************************************************/
#ifndef NN_FUNCTION_H
#define NN_FUNCTION_H
/* os shim includes */
#include "os_types.h"
#ifdef __cplusplus
extern "C" {
#endif
/** \defgroup linear_nn_APIs linear nn APIs
* @brief linear nn APIs
* Linear neural network functions, including FC (fully connected) and 1D conv
*/
/** @addtogroup linear_nn_APIs
* @{
*/
struct conv1d_config_bean {
uint32_t in_channel; // number of input channels
uint32_t in_length; // length of the input
uint32_t out_channel; // number of output channels
uint32_t out_length; // length of the output; set by the conv routine once the calculation completes
uint32_t kernel_length; // length of the conv kernel
uint32_t stride; // stride of the conv
uint32_t dilation; // dilation of the conv
uint32_t padding_left; // padding on the left
uint32_t padding_right; // padding on the right
uint32_t group; // number of groups of the conv
};
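/* Below is a minimal sketch of filling a conv1d_config_bean (the values are
* illustrative, not from this library; out_length is shown here only to
* demonstrate the formula used by the conv1d routines further down):
* @code
* struct conv1d_config_bean bean = {
*     .in_channel    = 8,
*     .in_length     = 100,
*     .out_channel   = 16,
*     .out_length    = 0,    // filled in by the conv call
*     .kernel_length = 3,
*     .stride        = 1,
*     .dilation      = 1,
*     .padding_left  = 1,
*     .padding_right = 1,
*     .group         = 1,
* };
* // expected out_length = 1 + (100 + 1 + 1 - (3 - 1) * 1 - 1) / 1 = 100
* @endcode
*/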
/** @brief fc_int8_to_int8_weight_8bit_batch() - fully connected operation as below:
output[k, i] = (sum(input[k, j] * weight[i, j]) + bias[i]) >> output_right_shift
where the sum is done on index j.
saturation included, i.e. output elements out of range [-128, 127] will be set to -128 or 127
* @param input: where the input, a matrix of size batch * in_length, is put
* input should be arranged as below, as an example, we set in_length = 28, batch = 6:
* in[0, 0], in[0, 1], in[0, 2], ..., in[0, 27], 0, 0, 0, 0
* in[1, 0], in[1, 1], in[1, 2], ..., in[1, 27], 0, 0, 0, 0
* ...
* in[5, 0], in[5, 1], in[5, 2], ..., in[5, 27], 0, 0, 0, 0
* zeros are padded here to guarantee the distance between in[i, j] and in[i + 1, j] is ceil(in_length / 8) * 8 bytes
* @param output: where the output, a matrix of size batch * out_length, will be put
* output should be arranged as below, as an example, we set out_length = 28, batch = 6:
* out[0, 0], out[0, 1], out[0, 2], ..., out[0, 27], 0, 0, 0, 0,
* out[1, 0], out[1, 1], out[1, 2], ..., out[1, 27], 0, 0, 0, 0,
* ...
* out[5, 0], out[5, 1], out[5, 2], ..., out[5, 27], 0, 0, 0, 0,
* zeros are padded here to guarantee the distance between out[i, j] and out[i + 1, j] is ceil(out_length / 8) * 8 bytes
* @param weight: where the weight, a matrix of size out_length * in_length, is put
* weight should be arranged in the form below, as an example, we set in_length = 28, out_length = 6
* w[0, 0], w[0, 1], w[0, 2], ..., w[0, 27], 0, 0, 0, 0,
* w[1, 0], w[1, 1], w[1, 2], ..., w[1, 27], 0, 0, 0, 0,
* ...
* w[5, 0], w[5, 1], w[5, 2], ..., w[5, 27], 0, 0, 0, 0,
* zeros are padded here to guarantee the distance between w[i, j] and w[i + 1, j] in memory is ceil(in_length / 8) * 8 bytes
* @param bias: where the bias is put
* @param in_length: param shown above
* @param out_length: param shown above
* @param right_shift: the right shift number shown in the description of this function
* @param batch: param shown above
* @return 0 -- success
* @return 1 -- in_length is 0
* @return 2 -- out_length is 0
* @return 3 -- batch is 0
*/
uint8_t fc_int8_to_int8_weight_8bit_batch(int8_t *input, int8_t *output,
int8_t *weight, int32_t *bias, uint32_t in_length, uint32_t out_length,
uint8_t right_shift, uint32_t batch);
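/* A usage sketch (illustrative; the buffer sizes follow the padding rules
* above and the shift value is arbitrary):
* @code
* #define IN_LEN  28
* #define OUT_LEN 10
* #define BATCH   2
* #define PAD8(n) (((n) + 7) / 8 * 8)          // ceil(n / 8) * 8
*
* static int8_t  in[BATCH * PAD8(IN_LEN)];     // rows padded to 32 bytes
* static int8_t  out[BATCH * PAD8(OUT_LEN)];   // rows padded to 16 bytes
* static int8_t  w[OUT_LEN * PAD8(IN_LEN)];    // one padded row per output
* static int32_t bias[OUT_LEN];
*
* // ... fill in[], w[] and bias[], leaving the pad bytes zero ...
* uint8_t ret = fc_int8_to_int8_weight_8bit_batch(in, out, w, bias,
*                                                 IN_LEN, OUT_LEN, 7, BATCH);
* @endcode
*/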
/** @brief fc_int8_to_int8_weight_8bit_bias_shift_batch() - fully connected operation as below:
output[k, i] = (sum(input[k, j] * weight[i, j]) + (bias[i] << bias_left_shift)) >> output_right_shift
where the sum is done on index j.
saturation included, i.e. output elements out of range [-128, 127] will be set to -128 or 127
* @param input: where the input, a matrix of size batch * in_length, is put
* input should be arranged as below, as an example, we set in_length = 28, batch = 6:
* in[0, 0], in[0, 1], in[0, 2], ..., in[0, 27], 0, 0, 0, 0
* in[1, 0], in[1, 1], in[1, 2], ..., in[1, 27], 0, 0, 0, 0
* ...
* in[5, 0], in[5, 1], in[5, 2], ..., in[5, 27], 0, 0, 0, 0
* zeros are padded here to guarantee the distance between in[i, j] and in[i + 1, j] is ceil(in_length / 8) * 8 bytes
* @param output: where the output, a matrix of size batch * out_length, will be put
* output should be arranged as below, as an example, we set out_length = 28, batch = 6:
* out[0, 0], out[0, 1], out[0, 2], ..., out[0, 27], 0, 0, 0, 0
* out[1, 0], out[1, 1], out[1, 2], ..., out[1, 27], 0, 0, 0, 0
* ...
* out[5, 0], out[5, 1], out[5, 2], ..., out[5, 27], 0, 0, 0, 0
* zeros are padded here to guarantee the distance between out[i, j] and out[i + 1, j] is ceil(out_length / 8) * 8 bytes
* @param weight: where the weight, a matrix of size out_length * in_length, is put
* weight should be arranged in the form below, as an example, we set in_length = 28, out_length = 6
* w[0, 0], w[0, 1], w[0, 2], ..., w[0, 27], 0, 0, 0, 0,
* w[1, 0], w[1, 1], w[1, 2], ..., w[1, 27], 0, 0, 0, 0,
* ...
* w[5, 0], w[5, 1], w[5, 2], ..., w[5, 27], 0, 0, 0, 0,
* zeros are padded here to guarantee the distance between w[i, j] and w[i + 1, j] in memory is ceil(in_length / 8) * 8 bytes
* @param bias: where the bias is put
* @param in_length: param shown above
* @param out_length: param shown above
* @param right_shift: the right shift number shown in the description of this function
* @param bias_left_shift: the left shift number shown in the description of this function
* @param batch: param shown above
* @return 0 -- success
* @return 1 -- in_length is 0
* @return 2 -- out_length is 0
* @return 3 -- batch is 0
*/
uint8_t fc_int8_to_int8_weight_8bit_bias_shift_batch(int8_t *input,
int8_t *output, int8_t *weight, int8_t *bias,
uint32_t in_length, uint32_t out_length, uint8_t right_shift,
uint8_t bias_left_shift, uint32_t batch);
/** @brief fc_int16_to_int16_weight_16bit_batch() - fully connected operation as below:
output[k, i] = (sum(input[k, j] * weight[i, j]) + bias[i]) >> output_right_shift
where the sum is done on index j.
saturation included, i.e. output elements out of range [-32768, 32767] will be set to -32768 or 32767
* @param input: where the input, a matrix of size batch * in_length, is put
* input should be arranged as below, as an example, we set in_length = 14, batch = 6:
* in[0, 0], in[0, 1], in[0, 2], ..., in[0, 13], 0, 0
* in[1, 0], in[1, 1], in[1, 2], ..., in[1, 13], 0, 0
* ...
* in[5, 0], in[5, 1], in[5, 2], ..., in[5, 13], 0, 0
* zeros are padded here to guarantee the distance between in[i, j] and in[i + 1, j] is ceil(in_length / 4) * 8 bytes
* @param output: where the output, a matrix of size batch * out_length, will be put
* output should be arranged as below, as an example, we set out_length = 14, batch = 6:
* out[0, 0], out[0, 1], out[0, 2], ..., out[0, 13], 0, 0,
* out[1, 0], out[1, 1], out[1, 2], ..., out[1, 13], 0, 0,
* ...
* out[5, 0], out[5, 1], out[5, 2], ..., out[5, 13], 0, 0,
* zeros are padded here to guarantee the distance between out[i, j] and out[i + 1, j] is ceil(out_length / 4) * 8 bytes
* @param weight: where the weight, a matrix of size out_length * in_length, is put
* weight should be arranged as below, as an example, we set in_length = 14, out_length = 6:
* w[0, 0], w[0, 1], w[0, 2], ..., w[0, 13], 0, 0
* w[1, 0], w[1, 1], w[1, 2], ..., w[1, 13], 0, 0
* ...
* w[5, 0], w[5, 1], w[5, 2], ..., w[5, 13], 0, 0
* zeros are padded here to guarantee the distance between w[i, j] and w[i + 1, j] is ceil(in_length / 4) * 8 bytes
* @param bias: where the bias is put
* @param in_length: param shown above
* @param out_length: param shown above
* @param right_shift: the right shift number shown in the description of this function
* @param batch: param shown above
* @return 0 -- success
* @return 1 -- in_length is 0
* @return 2 -- out_length is 0
* @return 3 -- batch is 0
*/
uint8_t fc_int16_to_int16_weight_16bit_batch(int16_t *input, int16_t *output,
int16_t *weight, int64_t *bias, uint32_t in_length, uint32_t out_length,
uint8_t right_shift, uint32_t batch);
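/* For the 16-bit variants the stride is most naturally counted in int16_t
* elements: each padded row holds ceil(in_length / 4) * 4 elements, i.e.
* ceil(in_length / 4) * 8 bytes. A small illustrative sketch:
* @code
* #define PAD4(n) (((n) + 3) / 4 * 4)     // int16_t elements per padded row
* static int16_t in16[6 * PAD4(14)];      // 6 batches of 14 values + 2 pad elements
* @endcode
*/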
/** @brief fc_int16_to_int16_weight_16bit_bias_shift_batch() - fully connected operation as below:
output[k, i] = (sum(input[k, j] * weight[i, j]) + (bias[i] << bias_left_shift)) >> output_right_shift
where the sum is done on index j.
saturation included, i.e. output elements out of range [-32768, 32767] will be set to -32768 or 32767
* @param input: where the input, a matrix of size batch * in_length, is put
* input should be arranged as below, as an example, we set in_length = 14, batch = 6:
* in[0, 0], in[0, 1], in[0, 2], ..., in[0, 13], 0, 0
* in[1, 0], in[1, 1], in[1, 2], ..., in[1, 13], 0, 0
* ...
* in[5, 0], in[5, 1], in[5, 2], ..., in[5, 13], 0, 0
* zeros are padded here to guarantee the distance between in[i, j] and in[i + 1, j] is ceil(in_length / 4) * 8 bytes
* @param output: where the output, a matrix of size batch * out_length, will be put
* output should be arranged as below, as an example, we set out_length = 14, batch = 6:
* out[0, 0], out[0, 1], out[0, 2], ..., out[0, 13], 0, 0,
* out[1, 0], out[1, 1], out[1, 2], ..., out[1, 13], 0, 0,
* ...
* out[5, 0], out[5, 1], out[5, 2], ..., out[5, 13], 0, 0,
* zeros are padded here to guarantee the distance between out[i, j] and out[i + 1, j] is ceil(out_length / 4) * 8 bytes
* @param weight: where the weight, a matrix of size out_length * in_length, is put
* weight should be arranged as below, as an example, we set in_length = 14, out_length = 6:
* w[0, 0], w[0, 1], w[0, 2], ..., w[0, 13], 0, 0
* w[1, 0], w[1, 1], w[1, 2], ..., w[1, 13], 0, 0
* ...
* w[5, 0], w[5, 1], w[5, 2], ..., w[5, 13], 0, 0
* zeros are padded here to guarantee the distance between w[i, j] and w[i + 1, j] is ceil(in_length / 4) * 8 bytes
* @param bias: where the bias is put
* @param in_length: param shown above
* @param out_length: param shown above
* @param right_shift: the right shift number shown in the description of this function
* @param bias_left_shift: the left shift number shown in the description of this function
* @param batch: param shown above
* @return 0 -- success
* @return 1 -- in_length is 0
* @return 2 -- out_length is 0
* @return 3 -- batch is 0
*/
uint8_t fc_int16_to_int16_weight_16bit_bias_shift_batch(int16_t *input,
int16_t *output, int16_t *weight, int16_t *bias,
uint32_t in_length, uint32_t out_length, uint8_t right_shift,
uint8_t bias_left_shift, uint32_t batch);
/** @brief fc_depth_int8_to_int8_weight_8bit_batch() - "depth" fully connected operation as below:
output[k, i] = (input[k, i] * weight[i] + bias[i]) >> output_right_shift
saturation included, i.e. output elements out of range [-128, 127] will be set to -128 or 127
* @param input: where the input, a matrix of size batch * length, is put
* input should be arranged as below, as an example, we set length = 14, batch = 6:
* in[0, 0], in[0, 1], in[0, 2], ..., in[0, 13], 0, 0
* in[1, 0], in[1, 1], in[1, 2], ..., in[1, 13], 0, 0
* ...
* in[5, 0], in[5, 1], in[5, 2], ..., in[5, 13], 0, 0
* zeros are padded here to guarantee the distance between in[i, j] and in[i + 1, j] is ceil(length / 8) * 8 bytes
* @param output: where the output, a matrix of size batch * length, will be put
* output should be arranged as below, as an example, we set length = 14, batch = 6:
* out[0, 0], out[0, 1], out[0, 2], ..., out[0, 13], 0, 0,
* out[1, 0], out[1, 1], out[1, 2], ..., out[1, 13], 0, 0,
* ...
* out[5, 0], out[5, 1], out[5, 2], ..., out[5, 13], 0, 0,
* zeros are padded here to guarantee the distance between out[i, j] and out[i + 1, j] is ceil(length / 8) * 8 bytes
* @param weight: where the weight is put
* @param bias: where the bias is put
* @param length: param shown above
* @param output_right_shift: the right shift number shown in the description of this function
* @param batch: param shown above
* @return 0 -- success
* @return 1 -- length is 0
* @return 2 -- batch is 0
*/
uint8_t fc_depth_int8_to_int8_weight_8bit_batch(int8_t *input, int8_t *output,
int8_t *weight, int32_t *bias, uint32_t length, uint8_t output_right_shift,
uint32_t batch);
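/* A plain-C reference model of the arithmetic above (a sketch that ignores
* the padded row layout and simply walks length-sized rows):
* @code
* for (uint32_t k = 0; k < batch; k++) {
*     for (uint32_t i = 0; i < length; i++) {
*         int64_t acc = (int32_t)input[k * length + i] * weight[i] + bias[i];
*         acc >>= output_right_shift;
*         if (acc >  127) acc =  127;          // saturate to the int8 range
*         if (acc < -128) acc = -128;
*         output[k * length + i] = (int8_t)acc;
*     }
* }
* @endcode
*/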
/** @brief fc_depth_int8_to_int8_weight_8bit_bias_shift_batch() - "depth" fully connected operation as below:
output[k, i] = (input[k, i] * weight[i] + (bias[i] << bias_left_shift)) >> output_right_shift
saturation included, i.e. output elements out of range [-128, 127] will be set to -128 or 127
* @param input: where the input, a matrix of size batch * length, is put
* input should be arranged as below, as an example, we set length = 14, batch = 6:
* in[0, 0], in[0, 1], in[0, 2], ..., in[0, 13], 0, 0
* in[1, 0], in[1, 1], in[1, 2], ..., in[1, 13], 0, 0
* ...
* in[5, 0], in[5, 1], in[5, 2], ..., in[5, 13], 0, 0
* zeros are padded here to guarantee the distance between in[i, j] and in[i + 1, j] is ceil(length / 8) * 8 bytes
* @param output: where the output, a matrix of size batch * length, will be put
* output should be arranged as below, as an example, we set length = 14, batch = 6:
* out[0, 0], out[0, 1], out[0, 2], ..., out[0, 13], 0, 0,
* out[1, 0], out[1, 1], out[1, 2], ..., out[1, 13], 0, 0,
* ...
* out[5, 0], out[5, 1], out[5, 2], ..., out[5, 13], 0, 0,
* zeros are padded here to guarantee the distance between out[i, j] and out[i + 1, j] is ceil(length / 8) * 8 bytes
* @param weight: where the weight is put
* @param bias: where the bias is put
* @param length: param shown above
* @param output_right_shift: the right shift number shown in the description of this function
* @param bias_left_shift: the left shift number shown in the description of this function
* @param batch: param shown above
* @return 0 -- success
* @return 1 -- length is 0
* @return 2 -- batch is 0
*/
uint8_t fc_depth_int8_to_int8_weight_8bit_bias_shift_batch(int8_t *input,
int8_t *output, int8_t *weight, int8_t *bias, uint32_t length,
uint8_t output_right_shift, uint8_t bias_left_shift, uint32_t batch);
/** @brief fc_depth_int16_to_int16_weight_16bit_batch() - "depth" fully connected operation as below:
output[k, i] = (input[k, i] * weight[i] + bias[i]) >> output_right_shift
saturation included, i.e. output elements out of range [-32768, 32767] will be set to -32768 or 32767
* @param input: where the input, a matrix of size batch * length, is put
* input should be arranged as below, as an example, we set length = 14, batch = 6:
* in[0, 0], in[0, 1], in[0, 2], ..., in[0, 13], 0, 0
* in[1, 0], in[1, 1], in[1, 2], ..., in[1, 13], 0, 0
* ...
* in[5, 0], in[5, 1], in[5, 2], ..., in[5, 13], 0, 0
* zeros are padded here to guarantee the distance between in[i, j] and in[i + 1, j] is ceil(length / 4) * 8 bytes
* @param output: where the output, a matrix of size batch * length, will be put
* output should be arranged as below, as an example, we set length = 14, batch = 6:
* out[0, 0], out[0, 1], out[0, 2], ..., out[0, 13], 0, 0,
* out[1, 0], out[1, 1], out[1, 2], ..., out[1, 13], 0, 0,
* ...
* out[5, 0], out[5, 1], out[5, 2], ..., out[5, 13], 0, 0,
* zeros are padded here to guarantee the distance between out[i, j] and out[i + 1, j] is ceil(length / 4) * 8 bytes
* @param weight: where the weight is put
* @param bias: where the bias is put
* @param length: param shown above
* @param output_right_shift: the right shift number shown in the description of this function
* @param batch: param shown above
* @return 0 -- success
* @return 1 -- length is 0
* @return 2 -- batch is 0
*/
uint8_t fc_depth_int16_to_int16_weight_16bit_batch(int16_t *input,
int16_t *output, int16_t *weight, int64_t *bias, uint32_t length,
uint8_t output_right_shift, uint32_t batch);
/** @brief fc_depth_int16_to_int16_weight_16bit_bias_shift_batch() - "depth" fully connected operation as below:
output[k, i] = (input[k, i] * weight[i] + (bias[i] << bias_left_shift)) >> output_right_shift
saturation included, i.e. output elements out of range [-32768, 32767] will be set to -32768 or 32767
* @param input: where the input, a matrix of size batch * length, is put
* input should be arranged as below, as an example, we set length = 14, batch = 6:
* in[0, 0], in[0, 1], in[0, 2], ..., in[0, 13], 0, 0
* in[1, 0], in[1, 1], in[1, 2], ..., in[1, 13], 0, 0
* ...
* in[5, 0], in[5, 1], in[5, 2], ..., in[5, 13], 0, 0
* zeros are padded here to guarantee the distance between in[i, j] and in[i + 1, j] is ceil(length / 4) * 8 bytes
* @param output: where the output, a matrix of size batch * length, will be put
* output should be arranged as below, as an example, we set length = 14, batch = 6:
* out[0, 0], out[0, 1], out[0, 2], ..., out[0, 13], 0, 0,
* out[1, 0], out[1, 1], out[1, 2], ..., out[1, 13], 0, 0,
* ...
* out[5, 0], out[5, 1], out[5, 2], ..., out[5, 13], 0, 0,
* zeros are padded here to guarantee the distance between out[i, j] and out[i + 1, j] is ceil(length / 4) * 8 bytes
* @param weight: where the weight is put
* @param bias: where the bias is put
* @param length: param shown above
* @param output_right_shift: the right shift number shown in the description of this function
* @param bias_left_shift: the left shift number shown in the description of this function
* @param batch: param shown above
* @return 0 -- success
* @return 1 -- length is 0
* @return 2 -- batch is 0
*/
uint8_t fc_depth_int16_to_int16_weight_16bit_bias_shift_batch(int16_t *input,
int16_t *output, int16_t *weight, int16_t *bias, uint32_t length,
uint8_t output_right_shift, uint8_t bias_left_shift, uint32_t batch);
/** @brief conv1d_int8_to_int8_weight_8bit() - 1-dimensional conv operation as below:
output[c, l] = (sum(input[i, l * stride + k * dilation - padding_left] * weight[c, i, k]) + bias[c]) >> output_right_shift
where the sum is done on index i and k.
saturation included, i.e. output elements out of range [-128, 127] will be set to -128 or 127
* @param in: where the input, a 2D-tensor of size [in_channel, in_length], is put
* input is arranged in the form below, as an example, we set in_channel = 28, in_length = 10
* input[0, 0], input[1, 0], input[2, 0], ..., input[27, 0], 0, 0, 0, 0,
* input[0, 1], input[1, 1], ..., input[27, 1], 0, 0, 0, 0, ...,
* input[0, 9], ..., input[27, 9], 0, 0, 0, 0
* zeros are padded so that the distance between input[i, j] and input[i, j + 1] is ceil(in_channel / 16.0) * 16 bytes
* @param out: where the output, a 2D-tensor of size [out_channel, out_length], will be put; output is arranged in the same form as the input,
* where out_length = 1 + floor((in_length + padding_left + padding_right - (kernel_length - 1) * dilation - 1) / stride)
* @param weight: where the weight is put.
* if in_channel, out_channel, and group are the same, weight is a 3D-tensor of size [out_channel, 1, kernel_length].
* it should be arranged in the form below, as an example, we set kernel_length = 4, in_channel = 14, out_channel = 14
* w[0, 0, 0], w[1, 0, 0], ..., w[7, 0, 0], w[0, 0, 1], w[1, 0, 1], ..., w[7, 0, 1], ..., w[0, 0, 3], ..., w[7, 0, 3],
* w[8, 0, 0], ..., w[13, 0, 0], 0, 0, w[8, 0, 1], ..., w[13, 0, 1], 0, 0, ..., w[8, 0, 3], ..., w[13, 0, 3], 0, 0
* zeros are padded so that the distance between w[i, 0, k] and w[i, 0, k + 1] is always 8 bytes
* otherwise, weight is a 3D-tensor of size [out_channel, in_channel / group, kernel_length];
* it can be divided into groups, and within each group it is a 3D-tensor of size [out_channel / group, in_channel / group, kernel_length]
* weight should be arranged in the form below, as an example, we set kernel_length = 4, in_channel = 4, out_channel = 28, group = 2
* (noticing in_channel / group = 2, out_channel / group = 14, so w[0:14, :, :] is for the 1st group, w[14:28, :, :] is for the 2nd group)
* w[0, 0, 0], w[1, 0, 0], ..., w[7, 0, 0], w[0, 0, 1], w[1, 0, 1], ..., w[7, 0, 1], ..., w[0, 0, 3], ..., w[7, 0, 3],
* w[0, 1, 0], w[1, 1, 0], ..., w[7, 1, 0], w[0, 1, 1], w[1, 1, 1], ..., w[7, 1, 1], ..., w[0, 1, 3], ..., w[7, 1, 3],
* w[8, 0, 0], ..., w[13, 0, 0], 0, 0, w[8, 0, 1], ..., w[13, 0, 1], 0, 0, ..., w[8, 0, 3], ..., w[13, 0, 3], 0, 0,
* w[8, 1, 0], ..., w[13, 1, 0], 0, 0, w[8, 1, 1], ..., w[13, 1, 1], 0, 0, ..., w[8, 1, 3], ..., w[13, 1, 3], 0, 0,
* w[14, 0, 0], w[15, 0, 0], ..., w[21, 0, 0], w[14, 0, 1], w[15, 0, 1], ..., w[21, 0, 1], ..., w[14, 0, 3], ..., w[21, 0, 3],
* w[14, 1, 0], w[15, 1, 0], ..., w[21, 1, 0], w[14, 1, 1], w[15, 1, 1], ..., w[21, 1, 1], ..., w[14, 1, 3], ..., w[21, 1, 3],
* w[22, 0, 0], ..., w[27, 0, 0], 0, 0, w[22, 0, 1], ..., w[27, 0, 1], 0, 0, ..., w[22, 0, 3], ..., w[27, 0, 3], 0, 0,
* w[22, 1, 0], ..., w[27, 1, 0], 0, 0, w[22, 1, 1], ..., w[27, 1, 1], 0, 0, ..., w[22, 1, 3], ..., w[27, 1, 3], 0, 0
* zeros are padded here to guarantee the distance between w[i, j, k] and w[i, j, k + 1] in memory is 8 bytes
* @param bias: where the bias is put; if bias is not needed, this pointer should be set to NULL
* @param bean: pointer to the config, whose description is shown in the beginning
* @param output_right_shift: the right shift number shown in the description of this function
* @return 0 -- success
* @return 1 -- in_channel is 0
* @return 2 -- out_channel is 0
* @return 3 -- in_length is 0
* @return 4 -- kernel_length is 0
* @return 5 -- stride is 0
* @return 6 -- dilation is 0
* @return 7 -- group is 0
* @return 8 -- in_channel % group is not 0
* @return 9 -- out_channel % group is not 0
* @return 10 -- after calculation, output_length is not a positive number
*/
uint8_t conv1d_int8_to_int8_weight_8bit(int8_t *in, int8_t *out,
int8_t *weight, int32_t *bias, struct conv1d_config_bean *bean,
uint8_t output_right_shift);
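/* A usage sketch for the depthwise case (illustrative values; the weight size
* follows the depthwise layout above: ceil(channel / 8) rows of 8 bytes per tap):
* @code
* enum { CH = 16, IL = 64, KL = 3 };
* struct conv1d_config_bean bean = {
*     .in_channel = CH, .in_length = IL, .out_channel = CH,
*     .kernel_length = KL, .stride = 1, .dilation = 1,
*     .padding_left = 1, .padding_right = 1, .group = CH,  // depthwise
* };
* static int8_t  in[IL * CH];             // 16 channels fill one 16-byte vector, no pad
* static int8_t  out[IL * CH];            // "same" padding keeps out_length == in_length
* static int8_t  w[(CH / 8) * KL * 8];    // 2 rows of 8 channels, 8 bytes per tap
* static int32_t bias[CH];
* uint8_t ret = conv1d_int8_to_int8_weight_8bit(in, out, w, bias, &bean, 7);
* // on success, bean.out_length == 1 + (64 + 1 + 1 - (3 - 1) * 1 - 1) / 1 == 64
* @endcode
*/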
/** @brief conv1d_int8_to_int8_weight_8bit_bias_shift() - 1-dimensional conv operation as below:
output[c, l] = (sum(input[i, l * stride + k * dilation - padding_left] * weight[c, i, k]) + (bias[c] << bias_left_shift)) >> output_right_shift
where the sum is done on index i and k.
saturation included, i.e. output elements out of range [-128, 127] will be set to -128 or 127
* @param in: where the input, a 2D-tensor of size [in_channel, in_length], is put
* input is arranged in the form below, as an example, we set in_channel = 28, in_length = 10
* input[0, 0], input[1, 0], input[2, 0], ..., input[27, 0], 0, 0, 0, 0,
* input[0, 1], input[1, 1], ..., input[27, 1], 0, 0, 0, 0, ...,
* input[0, 9], ..., input[27, 9], 0, 0, 0, 0
* zeros are padded so that the distance between input[i, j] and input[i, j + 1] is ceil(in_channel / 16.0) * 16 bytes
* @param out: where the output, a 2D-tensor of size [out_channel, out_length], will be put; output is arranged in the same form as the input,
* where out_length = 1 + floor((in_length + padding_left + padding_right - (kernel_length - 1) * dilation - 1) / stride)
* @param weight: where the weight is put.
* if in_channel, out_channel, and group are the same, weight is a 3D-tensor of size [out_channel, 1, kernel_length].
* it should be arranged in the form below, as an example, we set kernel_length = 4, in_channel = 14, out_channel = 14
* w[0, 0, 0], w[1, 0, 0], ..., w[7, 0, 0], w[0, 0, 1], w[1, 0, 1], ..., w[7, 0, 1], ..., w[0, 0, 3], ..., w[7, 0, 3],
* w[8, 0, 0], ..., w[13, 0, 0], 0, 0, w[8, 0, 1], ..., w[13, 0, 1], 0, 0, ..., w[8, 0, 3], ..., w[13, 0, 3], 0, 0
* zeros are padded so that the distance between w[i, 0, k] and w[i, 0, k + 1] is always 8 bytes
* otherwise, weight is a 3D-tensor of size [out_channel, in_channel / group, kernel_length];
* it can be divided into groups, and within each group it is a 3D-tensor of size [out_channel / group, in_channel / group, kernel_length]
* weight should be arranged in the form below, as an example, we set kernel_length = 4, in_channel = 4, out_channel = 28, group = 2
* (noticing in_channel / group = 2, out_channel / group = 14, so w[0:14, :, :] is for the 1st group, w[14:28, :, :] is for the 2nd group)
* w[0, 0, 0], w[1, 0, 0], ..., w[7, 0, 0], w[0, 0, 1], w[1, 0, 1], ..., w[7, 0, 1], ..., w[0, 0, 3], ..., w[7, 0, 3],
* w[0, 1, 0], w[1, 1, 0], ..., w[7, 1, 0], w[0, 1, 1], w[1, 1, 1], ..., w[7, 1, 1], ..., w[0, 1, 3], ..., w[7, 1, 3],
* w[8, 0, 0], ..., w[13, 0, 0], 0, 0, w[8, 0, 1], ..., w[13, 0, 1], 0, 0, ..., w[8, 0, 3], ..., w[13, 0, 3], 0, 0,
* w[8, 1, 0], ..., w[13, 1, 0], 0, 0, w[8, 1, 1], ..., w[13, 1, 1], 0, 0, ..., w[8, 1, 3], ..., w[13, 1, 3], 0, 0,
* w[14, 0, 0], w[15, 0, 0], ..., w[21, 0, 0], w[14, 0, 1], w[15, 0, 1], ..., w[21, 0, 1], ..., w[14, 0, 3], ..., w[21, 0, 3],
* w[14, 1, 0], w[15, 1, 0], ..., w[21, 1, 0], w[14, 1, 1], w[15, 1, 1], ..., w[21, 1, 1], ..., w[14, 1, 3], ..., w[21, 1, 3],
* w[22, 0, 0], ..., w[27, 0, 0], 0, 0, w[22, 0, 1], ..., w[27, 0, 1], 0, 0, ..., w[22, 0, 3], ..., w[27, 0, 3], 0, 0,
* w[22, 1, 0], ..., w[27, 1, 0], 0, 0, w[22, 1, 1], ..., w[27, 1, 1], 0, 0, ..., w[22, 1, 3], ..., w[27, 1, 3], 0, 0
* zeros are padded here to guarantee the distance between w[i, j, k] and w[i, j, k + 1] in memory is 8 bytes
* @param bias: where the bias is put; if bias is not needed, this pointer should be set to NULL
* @param bean: pointer to the config, whose description is shown in the beginning
* @param output_right_shift: the right shift number shown in the description of this function
* @param bias_left_shift: the left shift number shown in the description of this function
* @return 0 -- success
* @return 1 -- in_channel is 0
* @return 2 -- out_channel is 0
* @return 3 -- in_length is 0
* @return 4 -- kernel_length is 0
* @return 5 -- stride is 0
* @return 6 -- dilation is 0
* @return 7 -- group is 0
* @return 8 -- in_channel % group is not 0
* @return 9 -- out_channel % group is not 0
* @return 10 -- after calculation, output_length is not a positive number
*/
uint8_t conv1d_int8_to_int8_weight_8bit_bias_shift(int8_t *in, int8_t *out,
int8_t *weight, int8_t *bias, struct conv1d_config_bean *bean,
uint8_t output_right_shift, uint8_t bias_left_shift);
/** @brief conv1d_int16_to_int16_weight_16bit() - 1-dimensional conv operation as below:
output[c, l] = (sum(input[i, l * stride + k * dilation - padding_left] * weight[c, i, k]) + bias[c]) >> output_right_shift
where the sum is done on index i and k.
saturation included, i.e. output elements out of range [-32768, 32767] will be set to -32768 or 32767
* @param in: where the input, a 2D-tensor of size [in_channel, in_length], is put
* input is arranged in the form below, as an example, we set in_channel = 14, in_length = 10
* input[0, 0], input[1, 0], input[2, 0], ..., input[13, 0], 0, 0,
* input[0, 1], input[1, 1], ..., input[13, 1], 0, 0, ...,
* input[0, 9], ..., input[13, 9], 0, 0
* zeros are padded so that the distance between input[i, j] and input[i, j + 1] is ceil(in_channel / 8.0) * 16 bytes
* @param out: where the output, a 2D-tensor of size [out_channel, out_length], will be put; output is arranged in the same form as the input,
* where out_length = 1 + floor((in_length + padding_left + padding_right - (kernel_length - 1) * dilation - 1) / stride)
* @param weight: where the weight is put.
* if in_channel, out_channel, and group are the same, weight is a 3D-tensor of size [out_channel, 1, kernel_length].
* it should be arranged in the form below, as an example, we set kernel_length = 4, in_channel = 7, out_channel = 7
* w[0, 0, 0], w[1, 0, 0], w[2, 0, 0], w[3, 0, 0], w[0, 0, 1], w[1, 0, 1], w[2, 0, 1], w[3, 0, 1], ..., w[0, 0, 3], ..., w[3, 0, 3],
* w[4, 0, 0], w[5, 0, 0], w[6, 0, 0], 0, w[4, 0, 1], w[5, 0, 1], w[6, 0, 1], 0, ..., w[4, 0, 3], w[5, 0, 3], w[6, 0, 3], 0
* zeros are padded so that the distance between w[i, 0, k] and w[i, 0, k + 1] is always 8 bytes
* otherwise, weight is a 3D-tensor of size [out_channel, in_channel / group, kernel_length];
* it can be divided into groups, and within each group it is a 3D-tensor of size [out_channel / group, in_channel / group, kernel_length]
* weight should be arranged in the form below, as an example, we set kernel_length = 4, in_channel = 4, out_channel = 14, group = 2
* (noticing in_channel / group = 2, out_channel / group = 7, so w[0:7, :, :] is for the 1st group, w[7:14, :, :] is for the 2nd group)
* w[0, 0, 0], w[1, 0, 0], w[2, 0, 0], w[3, 0, 0], w[0, 0, 1], w[1, 0, 1], w[2, 0, 1], w[3, 0, 1], ..., w[0, 0, 3], ..., w[3, 0, 3],
* w[0, 1, 0], w[1, 1, 0], w[2, 1, 0], w[3, 1, 0], w[0, 1, 1], w[1, 1, 1], w[2, 1, 1], w[3, 1, 1], ..., w[0, 1, 3], ..., w[3, 1, 3],
* w[4, 0, 0], w[5, 0, 0], w[6, 0, 0], 0, w[4, 0, 1], w[5, 0, 1], w[6, 0, 1], 0, ..., w[4, 0, 3], w[5, 0, 3], w[6, 0, 3], 0
* w[4, 1, 0], w[5, 1, 0], w[6, 1, 0], 0, w[4, 1, 1], w[5, 1, 1], w[6, 1, 1], 0, ..., w[4, 1, 3], w[5, 1, 3], w[6, 1, 3], 0
* w[7, 0, 0], w[8, 0, 0], ..., w[10, 0, 0], w[7, 0, 1], w[8, 0, 1], ..., w[10, 0, 1], ..., w[7, 0, 3], ..., w[10, 0, 3],
* w[7, 1, 0], w[8, 1, 0], ..., w[10, 1, 0], w[7, 1, 1], w[8, 1, 1], ..., w[10, 1, 1], ..., w[7, 1, 3], ..., w[10, 1, 3],
* w[11, 0, 0], ..., w[13, 0, 0], 0, w[11, 0, 1], ..., w[13, 0, 1], 0, ..., w[11, 0, 3], ..., w[13, 0, 3], 0
* w[11, 1, 0], ..., w[13, 1, 0], 0, w[11, 1, 1], ..., w[13, 1, 1], 0, ..., w[11, 1, 3], ..., w[13, 1, 3], 0
* zeros are padded here to guarantee the distance between w[i, j, k] and w[i, j, k + 1] in memory is 8 bytes
* @param bias: where the bias is put; if bias is not needed, this pointer should be set to NULL
* @param bean: pointer to the config, whose description is shown in the beginning
* @param output_right_shift: the right shift number shown in the description of this function
* @return 0 -- success
* @return 1 -- in_channel is 0
* @return 2 -- out_channel is 0
* @return 3 -- in_length is 0
* @return 4 -- kernel_length is 0
* @return 5 -- stride is 0
* @return 6 -- dilation is 0
* @return 7 -- group is 0
* @return 8 -- in_channel % group is not 0
* @return 9 -- out_channel % group is not 0
* @return 10 -- after calculation, output_length is not a positive number
*/
uint8_t conv1d_int16_to_int16_weight_16bit(int16_t *in, int16_t *out,
int16_t *weight, int64_t *bias, struct conv1d_config_bean *bean,
uint8_t output_right_shift);
/** @brief conv1d_int16_to_int16_weight_16bit_bias_shift() - 1-dimensional conv operation as below:
output[c, l] = (sum(input[i, l * stride + k * dilation - padding_left] * weight[c, i, k]) + (bias[c] << bias_left_shift)) >> output_right_shift
where the sum is done on index i and k.
saturation included, i.e. output elements out of range [-32768, 32767] will be set to -32768 or 32767
* @param in: where the input, a 2D-tensor of size [in_channel, in_length], is put
* input is arranged in the form below, as an example, we set in_channel = 14, in_length = 10
* input[0, 0], input[1, 0], input[2, 0], ..., input[13, 0], 0, 0,
* input[0, 1], input[1, 1], ..., input[13, 1], 0, 0, ...,
* input[0, 9], ..., input[13, 9], 0, 0
* zeros are padded so that the distance between input[i, j] and input[i, j + 1] is ceil(in_channel / 8.0) * 16 bytes
* @param out: where the output, a 2D-tensor of size [out_channel, out_length], will be put; output is arranged in the same form as the input,
* where out_length = 1 + floor((in_length + padding_left + padding_right - (kernel_length - 1) * dilation - 1) / stride)
* @param weight: where the weight is put.
* if in_channel, out_channel, and group are the same, weight is a 3D-tensor of size [out_channel, 1, kernel_length].
* it should be arranged in the form below, as an example, we set kernel_length = 4, in_channel = 7, out_channel = 7
* w[0, 0, 0], w[1, 0, 0], w[2, 0, 0], w[3, 0, 0], w[0, 0, 1], w[1, 0, 1], w[2, 0, 1], w[3, 0, 1], ..., w[0, 0, 3], ..., w[3, 0, 3],
* w[4, 0, 0], w[5, 0, 0], w[6, 0, 0], 0, w[4, 0, 1], w[5, 0, 1], w[6, 0, 1], 0, ..., w[4, 0, 3], w[5, 0, 3], w[6, 0, 3], 0
* zeros are padded so that the distance between w[i, 0, k] and w[i, 0, k + 1] is always 8 bytes
* otherwise, weight is a 3D-tensor of size [out_channel, in_channel / group, kernel_length];
* it can be divided into groups, and within each group it is a 3D-tensor of size [out_channel / group, in_channel / group, kernel_length]
* weight should be arranged in the form below, as an example, we set kernel_length = 4, in_channel = 4, out_channel = 14, group = 2
* (noticing in_channel / group = 2, out_channel / group = 7, so w[0:7, :, :] is for the 1st group, w[7:14, :, :] is for the 2nd group)
* w[0, 0, 0], w[1, 0, 0], w[2, 0, 0], w[3, 0, 0], w[0, 0, 1], w[1, 0, 1], w[2, 0, 1], w[3, 0, 1], ..., w[0, 0, 3], ..., w[3, 0, 3],
* w[0, 1, 0], w[1, 1, 0], w[2, 1, 0], w[3, 1, 0], w[0, 1, 1], w[1, 1, 1], w[2, 1, 1], w[3, 1, 1], ..., w[0, 1, 3], ..., w[3, 1, 3],
* w[4, 0, 0], w[5, 0, 0], w[6, 0, 0], 0, w[4, 0, 1], w[5, 0, 1], w[6, 0, 1], 0, ..., w[4, 0, 3], w[5, 0, 3], w[6, 0, 3], 0
* w[4, 1, 0], w[5, 1, 0], w[6, 1, 0], 0, w[4, 1, 1], w[5, 1, 1], w[6, 1, 1], 0, ..., w[4, 1, 3], w[5, 1, 3], w[6, 1, 3], 0
* w[7, 0, 0], w[8, 0, 0], ..., w[10, 0, 0], w[7, 0, 1], w[8, 0, 1], ..., w[10, 0, 1], ..., w[7, 0, 3], ..., w[10, 0, 3],
* w[7, 1, 0], w[8, 1, 0], ..., w[10, 1, 0], w[7, 1, 1], w[8, 1, 1], ..., w[10, 1, 1], ..., w[7, 1, 3], ..., w[10, 1, 3],
* w[11, 0, 0], ..., w[13, 0, 0], 0, w[11, 0, 1], ..., w[13, 0, 1], 0, ..., w[11, 0, 3], ..., w[13, 0, 3], 0
* w[11, 1, 0], ..., w[13, 1, 0], 0, w[11, 1, 1], ..., w[13, 1, 1], 0, ..., w[11, 1, 3], ..., w[13, 1, 3], 0
* zeros are padded here to guarantee the distance between w[i, j, k] and w[i, j, k + 1] in memory is 8 bytes
* @param bias: where the bias is put; if bias is not needed, this pointer should be set to NULL
* @param bean: pointer to the config, whose description is shown in the beginning
* @param output_right_shift: the right shift number shown in the description of this function
* @param bias_left_shift: the left shift number shown in the description of this function
* @return 0 -- success
* @return 1 -- in_channel is 0
* @return 2 -- out_channel is 0
* @return 3 -- in_length is 0
* @return 4 -- kernel_length is 0
* @return 5 -- stride is 0
* @return 6 -- dilation is 0
* @return 7 -- group is 0
* @return 8 -- in_channel % group is not 0
* @return 9 -- out_channel % group is not 0
* @return 10 -- after calculation, output_length is not a positive number
*/
uint8_t conv1d_int16_to_int16_weight_16bit_bias_shift(int16_t *in, int16_t *out,
int16_t *weight, int16_t *bias, struct conv1d_config_bean *bean,
uint8_t output_right_shift, uint8_t bias_left_shift);
/** @brief sigmoid_int8() - sigmoid function
* @param input_: pointer to the input vector; input should be of form s(8, 4), i.e. a signed 8-bit number with a 4-bit fraction
* @param output: pointer to the output vector; output will be of form s(8, 7), i.e. a signed 8-bit number with a 7-bit fraction
* @param length: number of elements in the input and output vectors
*/
void sigmoid_int8(int8_t *input_, int8_t *output, uint32_t length);
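/* A fixed-point usage sketch (values are illustrative): 1.0 in s(8, 4) is the
* integer 16, and sigmoid(1.0) ~= 0.731, i.e. about 0.731 * 128 ~= 94 in s(8, 7):
* @code
* int8_t x[4] = { 16, -16, 0, 32 };    // 1.0, -1.0, 0.0, 2.0 in s(8, 4)
* int8_t y[4];
* sigmoid_int8(x, y, 4);
* // y is roughly { 94, 34, 64, 113 }, i.e. sigmoid(v) scaled by 128
* @endcode
*/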
/** @brief sigmoid_int16() - sigmoid function
* @param input_: pointer to the input vector; input should be of form s(16, 12), i.e. a signed 16-bit number with a 12-bit fraction
* @param output: pointer to the output vector; output will be of form s(16, 15), i.e. a signed 16-bit number with a 15-bit fraction
* @param length: number of elements in the input and output vectors
*/
void sigmoid_int16(int16_t *input_, int16_t *output, uint32_t length);
/** @brief tanh_int8() - tanh function
* @param input_: pointer to the input vector; input should be of form s(8, 5), i.e. a signed 8-bit number with a 5-bit fraction
* @param output: pointer to the output vector; output will be of form s(8, 7), i.e. a signed 8-bit number with a 7-bit fraction
* @param length: number of elements in the input and output vectors
*/
void tanh_int8(int8_t *input_, int8_t *output, uint32_t length);
/** @brief tanh_int16() - tanh function
* @param input_: pointer to the input vector; input should be of form s(16, 13), i.e. a signed 16-bit number with a 13-bit fraction
* @param output: pointer to the output vector; output will be of form s(16, 15), i.e. a signed 16-bit number with a 15-bit fraction
* @param length: number of elements in the input and output vectors
*/
void tanh_int16(int16_t *input_, int16_t *output, uint32_t length);
/** @brief softmax_int8() - softmax function
* @param input_: pointer to the input vector; input should be of form s(8, 4), i.e. a signed 8-bit number with a 4-bit fraction
* @param output: pointer to the output vector; output will be of form s(8, 7), i.e. a signed 8-bit number with a 7-bit fraction
* @param length: number of elements in the input and output vectors
*/
void softmax_int8(int8_t *input_, int8_t *output, uint16_t length);
/** @brief softmax_int16() - softmax function
* @param input_: pointer to the input vector; input should be of form s(16, 12), i.e. a signed 16-bit number with a 12-bit fraction
* @param output: pointer to the output vector; output will be of form s(16, 15), i.e. a signed 16-bit number with a 15-bit fraction
* @param length: number of elements in the input and output vectors
*/
void softmax_int16(int16_t *input_, int16_t *output, uint16_t length);
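/* A softmax usage sketch (values are illustrative): in s(8, 7) the outputs
* represent probabilities scaled by 128, so they should sum to roughly 128
* (up to rounding):
* @code
* int8_t logits[4] = { 32, 16, 0, -16 };   // 2.0, 1.0, 0.0, -1.0 in s(8, 4)
* int8_t probs[4];
* softmax_int8(logits, probs, 4);
* // probs[0] is the largest; the four entries sum to about 128
* @endcode
*/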
/** @brief logsoftmax_int8() - logsoftmax function, formula as below:
output[b, i] = input_[b, i] - ln(sum(exp(input_[b, j]))), where the sum is done over j
input and output vectors should be of form s(8, 5), i.e. signed 8-bit numbers with a 5-bit fraction
* @param input_: where the input_, a matrix of size batch * length, is put
* input should be arranged as below, as an example, we set length = 14, batch = 6:
* in[0, 0], in[0, 1], in[0, 2], ..., in[0, 13], 0, 0
* in[1, 0], in[1, 1], in[1, 2], ..., in[1, 13], 0, 0
* ...
* in[5, 0], in[5, 1], in[5, 2], ..., in[5, 13], 0, 0
* zeros are padded here to guarantee the distance between in[i, j] and in[i + 1, j] is ceil(length / 8) * 8 bytes
* @param output: where the output, a matrix of size batch * length, will be put
* output should be arranged as below, as an example, we set length = 14, batch = 6:
* out[0, 0], out[0, 1], out[0, 2], ..., out[0, 13], 0, 0,
* out[1, 0], out[1, 1], out[1, 2], ..., out[1, 13], 0, 0,
* ...
* out[5, 0], out[5, 1], out[5, 2], ..., out[5, 13], 0, 0,
* zeros are padded here to guarantee the distance between out[i, j] and out[i + 1, j] is ceil(length / 8) * 8 bytes
* @param max_indice: where the index of the maximum of each output batch is stored
* @param max: where the maximum value of each output batch is stored
* @param length: length of each input vector
* @param batch: number of input vectors
* @return 0 -- success
* @return 1 -- length < 2
* @return 2 -- batch is 0
* @return 3 -- batch is not a multiple of 8
*/
uint8_t logsoftmax_int8(int8_t *input_, int8_t *output, uint16_t *max_indice, int8_t *max, uint16_t length, uint32_t batch);
/** @brief logsoftmax_int16() - logsoftmax function, formula as below:
output[b, i] = input_[b, i] - ln(sum(exp(input_[b, j]))), where the sum is done over j
input and output vectors should be of form s(16, 12), i.e. signed 16-bit numbers with a 12-bit fraction
* @param input_: where the input_, a matrix of size batch * length, is put
* input should be arranged as below, as an example, we set length = 14, batch = 6:
* in[0, 0], in[0, 1], in[0, 2], ..., in[0, 13], 0, 0
* in[1, 0], in[1, 1], in[1, 2], ..., in[1, 13], 0, 0
* ...
* in[5, 0], in[5, 1], in[5, 2], ..., in[5, 13], 0, 0
* zeros are padded here to guarantee the distance between in[i, j] and in[i + 1, j] is ceil(length / 4) * 8 bytes
* @param output: where the output, a matrix of size batch * length, will be put
* output should be arranged as below, as an example, we set length = 14, batch = 6:
* out[0, 0], out[0, 1], out[0, 2], ..., out[0, 13], 0, 0,
* out[1, 0], out[1, 1], out[1, 2], ..., out[1, 13], 0, 0,
* ...
* out[5, 0], out[5, 1], out[5, 2], ..., out[5, 13], 0, 0,
* zeros are padded here to guarantee the distance between out[i, j] and out[i + 1, j] is ceil(length / 4) * 8 bytes
* @param max_indice: where the index of the maximum of each output batch is stored
* @param max: where the maximum value of each output batch is stored
* @param length: length of each input vector
* @param batch: number of input vectors
* @return 0 -- success
* @return 1 -- length < 2
* @return 2 -- batch is 0
* @return 3 -- batch is not a multiple of 4
*/
uint8_t logsoftmax_int16(int16_t *input_, int16_t *output, uint16_t *max_indice, int16_t *max, uint16_t length, uint32_t batch);
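/* A usage sketch for the int16 variant (illustrative; note that batch must be
* a multiple of 4 here, and a multiple of 8 for logsoftmax_int8):
* @code
* enum { LEN = 10, BATCH = 4, STRIDE = 12 };   // ceil(10 / 4) * 4 elements per row
* static int16_t  in[BATCH * STRIDE];          // s(16, 12), rows zero-padded
* static int16_t  out[BATCH * STRIDE];
* static uint16_t max_idx[BATCH];
* static int16_t  max_val[BATCH];
* uint8_t ret = logsoftmax_int16(in, out, max_idx, max_val, LEN, BATCH);
* @endcode
*/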
/**
* @}
*/
#ifdef __cplusplus
}
#endif
#endif /* NN_FUNCTION_H */