/****************************************************************************

Copyright(c) 2019 by Aerospace C.Power (Chongqing) Microelectronics. ALL RIGHTS RESERVED.

This Information is proprietary to Aerospace C.Power (Chongqing) Microelectronics and MAY NOT
be copied by any method or incorporated into another program without
the express written consent of Aerospace C.Power. This Information or any portion
thereof remains the property of Aerospace C.Power. The Information contained herein
is believed to be accurate and Aerospace C.Power assumes no responsibility or
liability for its use in any way and conveys no license or title under
any patent or copyright and makes no representation or warranty that this
Information is free from patent or copyright infringement.

****************************************************************************/
#ifndef NN_FUNCTION_H
#define NN_FUNCTION_H

/* os shim includes */
#include "os_types.h"

#ifdef __cplusplus
extern "C" {
#endif

/** \defgroup linear_nn_APIs Linear NN API
  * @brief Linear NN APIs
  * Linear neural network functions, including FC (fully connected) and 1D convolution.
  */

/** @addtogroup linear_nn_APIs
  * @{
  */

struct conv1d_config_bean {
    uint32_t in_channel; // number of input channels
    uint32_t in_length; // length of the input
    uint32_t out_channel; // number of output channels
    uint32_t out_length; // length of the output, set after the conv calculation is completed
    uint32_t kernel_length; // length of the conv kernel
    uint32_t stride; // stride of the conv
    uint32_t dilation; // dilation of the conv
    uint32_t padding_left; // padding on the left
    uint32_t padding_right; // padding on the right
    uint32_t group; // number of groups of the conv
};
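
/*
 * Illustrative sketch (not part of the original header): one plausible way to
 * fill a conv1d_config_bean before calling one of the conv1d_* functions below.
 * All numeric values are example choices, not requirements of the API.
 *
 *     struct conv1d_config_bean bean = {0};
 *     bean.in_channel    = 28;
 *     bean.in_length     = 10;
 *     bean.out_channel   = 28;
 *     bean.kernel_length = 4;
 *     bean.stride        = 1;
 *     bean.dilation      = 1;
 *     bean.padding_left  = 0;
 *     bean.padding_right = 0;
 *     bean.group         = 1;
 *     // bean.out_length is filled in by the conv1d_* functions; per their docs it becomes
 *     // 1 + (in_length + padding_left + padding_right - (kernel_length - 1) * dilation - 1) / stride
 */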

/** @brief fc_int8_to_int8_weight_8bit() - fully connected operation as below:
 *        output[i] = (sum(input[j] * weight[i, j]) + bias[i]) >> output_right_shift
 *        where the sum is taken over index j.
 *        Saturation is included, i.e. output elements outside the range [-128, 127] are clamped to -128 or 127.
 * @param input: where the input vector is stored
 * @param output: where the output vector will be written
 * @param weight: where the weight, a matrix of size out_length * in_length, is stored
 *                weight should be arranged in the form below; as an example, with in_length = 4 and out_length = 20:
 *                w[0, 0], w[1, 0], w[2, 0], ..., w[15, 0], w[0, 1], w[1, 1], w[2, 1], ..., w[15, 1], ..., w[0, 3], ..., w[15, 3],
 *                w[16, 0], w[17, 0], w[18, 0], w[19, 0], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 *                w[16, 1], w[17, 1], w[18, 1], w[19, 1], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 *                w[16, 2], w[17, 2], w[18, 2], w[19, 2], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 *                w[16, 3], w[17, 3], w[18, 3], w[19, 3], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 *                zeros are padded here to guarantee that the distance in memory between w[i, j] and w[i, j+1] is 16 bytes
 * @param bias: where the bias vector is stored
 * @param in_length: length of the input vector
 * @param out_length: length of the output vector
 * @param output_right_shift: the right-shift amount shown in the description of this function
 * @return 0 -- success
 * @return 1 -- in_length is 0
 * @return 2 -- out_length is 0
 */
uint8_t fc_int8_to_int8_weight_8bit(int8_t *input, int8_t *output,
    int8_t *weight, int32_t *bias, uint32_t in_length, uint32_t out_length,
    uint8_t output_right_shift);
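
/*
 * Usage sketch (added illustration; buffer sizes are assumptions derived from the
 * padded weight layout described above, with in_length = 4 and out_length = 20).
 * The actual weight/bias values would come from a trained, quantized model.
 *
 *     #define FC_IN_LEN   4u
 *     #define FC_OUT_LEN  20u
 *
 *     // the padded weight buffer holds ceil(out_length / 16.0) * 16 * in_length int8 values
 *     static int8_t  fc_weight[((FC_OUT_LEN + 15u) / 16u) * 16u * FC_IN_LEN]; // zero-padded as described
 *     static int32_t fc_bias[FC_OUT_LEN];
 *     static int8_t  fc_input[FC_IN_LEN];
 *     static int8_t  fc_output[FC_OUT_LEN];
 *
 *     uint8_t err = fc_int8_to_int8_weight_8bit(fc_input, fc_output, fc_weight,
 *                                               fc_bias, FC_IN_LEN, FC_OUT_LEN,
 *                                               7); // example output_right_shift
 *     if (err != 0) {
 *         // 1: in_length is 0, 2: out_length is 0
 *     }
 */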

/** @brief fc_int8_to_int8_weight_8bit_bias_shift() - fully connected operation as below:
 *        output[i] = (sum(input[j] * weight[i, j]) + (bias[i] << bias_left_shift)) >> output_right_shift
 *        where the sum is taken over index j.
 *        Saturation is included, i.e. output elements outside the range [-128, 127] are clamped to -128 or 127.
 * @param input: where the input vector is stored
 * @param output: where the output vector will be written
 * @param weight: where the weight, a matrix of size out_length * in_length, is stored
 *                weight should be arranged in the form below; as an example, with in_length = 4 and out_length = 20:
 *                w[0, 0], w[1, 0], w[2, 0], ..., w[15, 0], w[0, 1], w[1, 1], w[2, 1], ..., w[15, 1], ..., w[0, 3], ..., w[15, 3],
 *                w[16, 0], w[17, 0], w[18, 0], w[19, 0], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 *                w[16, 1], w[17, 1], w[18, 1], w[19, 1], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 *                w[16, 2], w[17, 2], w[18, 2], w[19, 2], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 *                w[16, 3], w[17, 3], w[18, 3], w[19, 3], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 *                zeros are padded here to guarantee that the distance in memory between w[i, j] and w[i, j+1] is 16 bytes
 * @param bias: where the bias vector is stored
 * @param in_length: length of the input vector
 * @param out_length: length of the output vector
 * @param output_right_shift: the right-shift amount shown in the description of this function
 * @param bias_left_shift: the left-shift amount shown in the description of this function
 * @return 0 -- success
 * @return 1 -- in_length is 0
 * @return 2 -- out_length is 0
 */
uint8_t fc_int8_to_int8_weight_8bit_bias_shift(int8_t *input, int8_t *output,
    int8_t *weight, int8_t *bias, uint32_t in_length, uint32_t out_length,
    uint8_t output_right_shift, uint8_t bias_left_shift);
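
/*
 * Usage sketch (added illustration): same call pattern as fc_int8_to_int8_weight_8bit
 * in the sketch above, except the bias is stored as int8_t and is left-shifted by
 * bias_left_shift before being added. The shift amounts are example values only.
 *
 *     static int8_t fc_bias8[FC_OUT_LEN];
 *
 *     uint8_t err = fc_int8_to_int8_weight_8bit_bias_shift(fc_input, fc_output,
 *         fc_weight, fc_bias8, FC_IN_LEN, FC_OUT_LEN,
 *         7,  // output_right_shift
 *         4); // bias_left_shift
 */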

/** @brief fc_int16_to_int16_weight_16bit() - fully connected operation as below:
 *        output[i] = (sum(input[j] * weight[i, j]) + bias[i]) >> output_right_shift
 *        where the sum is taken over index j.
 *        Saturation is included, i.e. output elements outside the range [-32768, 32767] are clamped to -32768 or 32767.
 * @param input: where the input vector is stored
 * @param output: where the output vector will be written
 * @param weight: where the weight, a matrix of size out_length * in_length, is stored
 *                weight should be arranged in the form below; as an example, with in_length = 4 and out_length = 10:
 *                w[0, 0], w[1, 0], w[2, 0], ..., w[7, 0], w[0, 1], w[1, 1], w[2, 1], ..., w[7, 1], ..., w[0, 3], ..., w[7, 3],
 *                w[8, 0], w[9, 0], 0, 0, 0, 0, 0, 0,
 *                w[8, 1], w[9, 1], 0, 0, 0, 0, 0, 0,
 *                w[8, 2], w[9, 2], 0, 0, 0, 0, 0, 0,
 *                w[8, 3], w[9, 3], 0, 0, 0, 0, 0, 0,
 *                zeros are padded here to guarantee that the distance in memory between w[i, j] and w[i, j+1] is 16 bytes
 * @param bias: where the bias vector is stored
 * @param in_length: length of the input vector
 * @param out_length: length of the output vector
 * @param output_right_shift: the right-shift amount shown in the description of this function
 * @return 0 -- success
 * @return 1 -- in_length is 0
 * @return 2 -- out_length is 0
 */
uint8_t fc_int16_to_int16_weight_16bit(int16_t *input, int16_t *output,
    int16_t *weight, int64_t *bias, uint32_t in_length, uint32_t out_length,
    uint8_t output_right_shift);
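
/*
 * Usage sketch (added illustration; sizes are assumptions derived from the layout
 * above): the 16-bit variant packs 8 int16 weights into each 16-byte row, so with
 * in_length = 4 and out_length = 10 the padded weight buffer holds
 * ceil(out_length / 8.0) * 8 * in_length int16 values.
 *
 *     #define FC16_IN_LEN   4u
 *     #define FC16_OUT_LEN  10u
 *
 *     static int16_t fc16_weight[((FC16_OUT_LEN + 7u) / 8u) * 8u * FC16_IN_LEN]; // zero-padded
 *     static int64_t fc16_bias[FC16_OUT_LEN];
 *     static int16_t fc16_input[FC16_IN_LEN];
 *     static int16_t fc16_output[FC16_OUT_LEN];
 *
 *     uint8_t err = fc_int16_to_int16_weight_16bit(fc16_input, fc16_output,
 *         fc16_weight, fc16_bias, FC16_IN_LEN, FC16_OUT_LEN,
 *         12); // example output_right_shift
 */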

/** @brief fc_int16_to_int16_weight_16bit_bias_shift() - fully connected operation as below:
 *        output[i] = (sum(input[j] * weight[i, j]) + (bias[i] << bias_left_shift)) >> output_right_shift
 *        where the sum is taken over index j.
 *        Saturation is included, i.e. output elements outside the range [-32768, 32767] are clamped to -32768 or 32767.
 * @param input: where the input vector is stored
 * @param output: where the output vector will be written
 * @param weight: where the weight, a matrix of size out_length * in_length, is stored
 *                weight should be arranged in the form below; as an example, with in_length = 4 and out_length = 10:
 *                w[0, 0], w[1, 0], w[2, 0], ..., w[7, 0], w[0, 1], w[1, 1], w[2, 1], ..., w[7, 1], ..., w[0, 3], ..., w[7, 3],
 *                w[8, 0], w[9, 0], 0, 0, 0, 0, 0, 0,
 *                w[8, 1], w[9, 1], 0, 0, 0, 0, 0, 0,
 *                w[8, 2], w[9, 2], 0, 0, 0, 0, 0, 0,
 *                w[8, 3], w[9, 3], 0, 0, 0, 0, 0, 0,
 *                zeros are padded here to guarantee that the distance in memory between w[i, j] and w[i, j+1] is 16 bytes
 * @param bias: where the bias vector is stored
 * @param in_length: length of the input vector
 * @param out_length: length of the output vector
 * @param output_right_shift: the right-shift amount shown in the description of this function
 * @param bias_left_shift: the left-shift amount shown in the description of this function
 * @return 0 -- success
 * @return 1 -- in_length is 0
 * @return 2 -- out_length is 0
 */
uint8_t fc_int16_to_int16_weight_16bit_bias_shift(int16_t *input, int16_t *output,
    int16_t *weight, int16_t *bias, uint32_t in_length, uint32_t out_length,
    uint8_t output_right_shift, uint8_t bias_left_shift);
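
/*
 * Usage sketch (added illustration): same call pattern as fc_int16_to_int16_weight_16bit
 * in the sketch above, except the bias is stored as int16_t and is left-shifted by
 * bias_left_shift before being added. The shift amounts are example values only.
 *
 *     static int16_t fc16_bias16[FC16_OUT_LEN];
 *
 *     uint8_t err = fc_int16_to_int16_weight_16bit_bias_shift(fc16_input, fc16_output,
 *         fc16_weight, fc16_bias16, FC16_IN_LEN, FC16_OUT_LEN,
 *         12, // output_right_shift
 *         6); // bias_left_shift
 */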

/** @brief conv1d_int8_to_int8_weight_8bit() - 1-dimensional convolution operation as below:
 *        output[c, l] = (sum(input[i, l * stride + k * dilation - padding_left] * weight[c, i, k]) + bias[c]) >> output_right_shift
 *        where the sum is taken over indices i and k.
 *        Saturation is included, i.e. output elements outside the range [-128, 127] are clamped to -128 or 127.
 * @param in: where the input, a 2D tensor of size [in_channel, in_length], is stored
 *            input is arranged in the form below; as an example, with in_channel = 28 and in_length = 10:
 *            input[0, 0], input[1, 0], input[2, 0], ..., input[27, 0], 0, 0, 0, 0,
 *            input[0, 1], input[1, 1], ..., input[27, 1], 0, 0, 0, 0, ...,
 *            input[0, 9], ..., input[27, 9], 0, 0, 0, 0
 *            zeros are padded so that the distance between input[i, j] and input[i, j + 1] is ceil(in_channel / 16.0) * 16 bytes
 * @param out: where the output, a 2D tensor of size [out_channel, out_length], will be written; it is arranged in the same form as the input,
 *             where out_length = 1 + floor((in_length + padding_left + padding_right - (kernel_length - 1) * dilation - 1) / stride)
 * @param weight: where the weight is stored.
 *                If in_channel, out_channel, and group are all equal, weight is a 3D tensor of size [out_channel, 1, kernel_length].
 *                It should be arranged in the form below; as an example, with kernel_length = 4, in_channel = 28, out_channel = 28:
 *                w[0, 0, 0], w[1, 0, 0], ..., w[15, 0, 0], w[0, 0, 1], w[1, 0, 1], ..., w[15, 0, 1], ..., w[0, 0, 3], ..., w[15, 0, 3],
 *                w[16, 0, 0], ..., w[27, 0, 0], 0, 0, 0, 0, w[16, 0, 1], ..., w[27, 0, 1], 0, 0, 0, 0, ..., w[16, 0, 3], ..., w[27, 0, 3], 0, 0, 0, 0
 *                zeros are padded so that the distance between w[i, 0, k] and w[i, 0, k + 1] is always 16 bytes.
 *                Otherwise, weight is a 3D tensor of size [out_channel, in_channel / group, kernel_length];
 *                it can be divided into groups, and within each group it is a 3D tensor of size [out_channel / group, in_channel / group, kernel_length].
 *                weight should be arranged in the form below; as an example, with kernel_length = 4, in_channel = 4, out_channel = 56, group = 2
 *                (note that in_channel / group = 2 and out_channel / group = 28, so output channels 0..27 form the 1st group and output channels 28..55 form the 2nd group):
 *                w[0, 0, 0], w[1, 0, 0], ..., w[15, 0, 0], w[0, 0, 1], w[1, 0, 1], ..., w[15, 0, 1], ..., w[0, 0, 3], ..., w[15, 0, 3],
 *                w[0, 1, 0], w[1, 1, 0], ..., w[15, 1, 0], w[0, 1, 1], w[1, 1, 1], ..., w[15, 1, 1], ..., w[0, 1, 3], ..., w[15, 1, 3],
 *                w[16, 0, 0], ..., w[27, 0, 0], 0, 0, 0, 0, w[16, 0, 1], ..., w[27, 0, 1], 0, 0, 0, 0, ..., w[16, 0, 3], ..., w[27, 0, 3], 0, 0, 0, 0
 *                w[16, 1, 0], ..., w[27, 1, 0], 0, 0, 0, 0, w[16, 1, 1], ..., w[27, 1, 1], 0, 0, 0, 0, ..., w[16, 1, 3], ..., w[27, 1, 3], 0, 0, 0, 0
 *                w[28, 0, 0], w[29, 0, 0], ..., w[43, 0, 0], w[28, 0, 1], w[29, 0, 1], ..., w[43, 0, 1], ..., w[28, 0, 3], ..., w[43, 0, 3],
 *                w[28, 1, 0], w[29, 1, 0], ..., w[43, 1, 0], w[28, 1, 1], w[29, 1, 1], ..., w[43, 1, 1], ..., w[28, 1, 3], ..., w[43, 1, 3],
 *                w[44, 0, 0], ..., w[55, 0, 0], 0, 0, 0, 0, w[44, 0, 1], ..., w[55, 0, 1], 0, 0, 0, 0, ..., w[44, 0, 3], ..., w[55, 0, 3], 0, 0, 0, 0
 *                w[44, 1, 0], ..., w[55, 1, 0], 0, 0, 0, 0, w[44, 1, 1], ..., w[55, 1, 1], 0, 0, 0, 0, ..., w[44, 1, 3], ..., w[55, 1, 3], 0, 0, 0, 0
 *                zeros are padded here to guarantee that the distance in memory between w[i, j, k] and w[i, j, k + 1] is 16 bytes
 * @param bias: where the bias is stored; if no bias is needed, this pointer should be set to NULL
 * @param bean: pointer to the config struct described at the beginning of this file
 * @param output_right_shift: the right-shift amount shown in the description of this function
 * @return 0 -- success
 * @return 1 -- in_channel is 0
 * @return 2 -- out_channel is 0
 * @return 3 -- in_length is 0
 * @return 4 -- kernel_length is 0
 * @return 5 -- stride is 0
 * @return 6 -- dilation is 0
 * @return 7 -- group is 0
 * @return 8 -- in_channel % group is not 0
 * @return 9 -- out_channel % group is not 0
 * @return 10 -- the computed out_length is not a positive number
 */
uint8_t conv1d_int8_to_int8_weight_8bit(int8_t *in, int8_t *out,
    int8_t *weight, int32_t *bias, struct conv1d_config_bean *bean,
    uint8_t output_right_shift);
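
/*
 * Usage sketch (added illustration; buffer sizes are assumptions derived from the
 * padded layouts described above): a depthwise-style call with
 * in_channel = out_channel = group = 28, kernel_length = 4 and in_length = 10.
 *
 *     enum { CONV_C = 28, CONV_L = 10, CONV_K = 4,
 *            CONV_C_PAD = ((CONV_C + 15) / 16) * 16 }; // channels padded to a multiple of 16
 *
 *     static int8_t  conv_in[CONV_C_PAD * CONV_L];     // zero-padded per time step
 *     static int8_t  conv_out[CONV_C_PAD * CONV_L];    // out_length <= in_length for this config
 *     static int8_t  conv_weight[CONV_C_PAD * CONV_K]; // depthwise weights, zero-padded per tap
 *     static int32_t conv_bias[CONV_C];                // or pass NULL when no bias is needed
 *
 *     struct conv1d_config_bean bean = {0};
 *     bean.in_channel = CONV_C;  bean.in_length = CONV_L;
 *     bean.out_channel = CONV_C; bean.kernel_length = CONV_K;
 *     bean.stride = 1; bean.dilation = 1;
 *     bean.padding_left = 0; bean.padding_right = 0;
 *     bean.group = CONV_C; // in_channel == out_channel == group -> depthwise layout
 *
 *     uint8_t err = conv1d_int8_to_int8_weight_8bit(conv_in, conv_out, conv_weight,
 *                                                   conv_bias, &bean, 7);
 *     // on success, bean.out_length == 1 + (10 - (4 - 1) - 1) / 1 == 7
 */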

/** @brief conv1d_int8_to_int8_weight_8bit_bias_shift() - 1-dimensional convolution operation as below:
 *        output[c, l] = (sum(input[i, l * stride + k * dilation - padding_left] * weight[c, i, k]) + (bias[c] << bias_left_shift)) >> output_right_shift
 *        where the sum is taken over indices i and k.
 *        Saturation is included, i.e. output elements outside the range [-128, 127] are clamped to -128 or 127.
 * @param in: where the input, a 2D tensor of size [in_channel, in_length], is stored
 *            input is arranged in the form below; as an example, with in_channel = 28 and in_length = 10:
 *            input[0, 0], input[1, 0], input[2, 0], ..., input[27, 0], 0, 0, 0, 0,
 *            input[0, 1], input[1, 1], ..., input[27, 1], 0, 0, 0, 0, ...,
 *            input[0, 9], ..., input[27, 9], 0, 0, 0, 0
 *            zeros are padded so that the distance between input[i, j] and input[i, j + 1] is ceil(in_channel / 16.0) * 16 bytes
 * @param out: where the output, a 2D tensor of size [out_channel, out_length], will be written; it is arranged in the same form as the input,
 *             where out_length = 1 + floor((in_length + padding_left + padding_right - (kernel_length - 1) * dilation - 1) / stride)
 * @param weight: where the weight is stored.
 *                If in_channel, out_channel, and group are all equal, weight is a 3D tensor of size [out_channel, 1, kernel_length].
 *                It should be arranged in the form below; as an example, with kernel_length = 4, in_channel = 28, out_channel = 28:
 *                w[0, 0, 0], w[1, 0, 0], ..., w[15, 0, 0], w[0, 0, 1], w[1, 0, 1], ..., w[15, 0, 1], ..., w[0, 0, 3], ..., w[15, 0, 3],
 *                w[16, 0, 0], ..., w[27, 0, 0], 0, 0, 0, 0, w[16, 0, 1], ..., w[27, 0, 1], 0, 0, 0, 0, ..., w[16, 0, 3], ..., w[27, 0, 3], 0, 0, 0, 0
 *                zeros are padded so that the distance between w[i, 0, k] and w[i, 0, k + 1] is always 16 bytes.
 *                Otherwise, weight is a 3D tensor of size [out_channel, in_channel / group, kernel_length];
 *                it can be divided into groups, and within each group it is a 3D tensor of size [out_channel / group, in_channel / group, kernel_length].
 *                weight should be arranged in the form below; as an example, with kernel_length = 4, in_channel = 4, out_channel = 56, group = 2
 *                (note that in_channel / group = 2 and out_channel / group = 28, so output channels 0..27 form the 1st group and output channels 28..55 form the 2nd group):
 *                w[0, 0, 0], w[1, 0, 0], ..., w[15, 0, 0], w[0, 0, 1], w[1, 0, 1], ..., w[15, 0, 1], ..., w[0, 0, 3], ..., w[15, 0, 3],
 *                w[0, 1, 0], w[1, 1, 0], ..., w[15, 1, 0], w[0, 1, 1], w[1, 1, 1], ..., w[15, 1, 1], ..., w[0, 1, 3], ..., w[15, 1, 3],
 *                w[16, 0, 0], ..., w[27, 0, 0], 0, 0, 0, 0, w[16, 0, 1], ..., w[27, 0, 1], 0, 0, 0, 0, ..., w[16, 0, 3], ..., w[27, 0, 3], 0, 0, 0, 0
 *                w[16, 1, 0], ..., w[27, 1, 0], 0, 0, 0, 0, w[16, 1, 1], ..., w[27, 1, 1], 0, 0, 0, 0, ..., w[16, 1, 3], ..., w[27, 1, 3], 0, 0, 0, 0
 *                w[28, 0, 0], w[29, 0, 0], ..., w[43, 0, 0], w[28, 0, 1], w[29, 0, 1], ..., w[43, 0, 1], ..., w[28, 0, 3], ..., w[43, 0, 3],
 *                w[28, 1, 0], w[29, 1, 0], ..., w[43, 1, 0], w[28, 1, 1], w[29, 1, 1], ..., w[43, 1, 1], ..., w[28, 1, 3], ..., w[43, 1, 3],
 *                w[44, 0, 0], ..., w[55, 0, 0], 0, 0, 0, 0, w[44, 0, 1], ..., w[55, 0, 1], 0, 0, 0, 0, ..., w[44, 0, 3], ..., w[55, 0, 3], 0, 0, 0, 0
 *                w[44, 1, 0], ..., w[55, 1, 0], 0, 0, 0, 0, w[44, 1, 1], ..., w[55, 1, 1], 0, 0, 0, 0, ..., w[44, 1, 3], ..., w[55, 1, 3], 0, 0, 0, 0
 *                zeros are padded here to guarantee that the distance in memory between w[i, j, k] and w[i, j, k + 1] is 16 bytes
 * @param bias: where the bias is stored; if no bias is needed, this pointer should be set to NULL
 * @param bean: pointer to the config struct described at the beginning of this file
 * @param output_right_shift: the right-shift amount shown in the description of this function
 * @param bias_left_shift: the left-shift amount shown in the description of this function
 * @return 0 -- success
 * @return 1 -- in_channel is 0
 * @return 2 -- out_channel is 0
 * @return 3 -- in_length is 0
 * @return 4 -- kernel_length is 0
 * @return 5 -- stride is 0
 * @return 6 -- dilation is 0
 * @return 7 -- group is 0
 * @return 8 -- in_channel % group is not 0
 * @return 9 -- out_channel % group is not 0
 * @return 10 -- the computed out_length is not a positive number
 */
uint8_t conv1d_int8_to_int8_weight_8bit_bias_shift(int8_t *in, int8_t *out,
    int8_t *weight, int8_t *bias, struct conv1d_config_bean *bean,
    uint8_t output_right_shift, uint8_t bias_left_shift);
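
/*
 * Usage sketch (added illustration): same call pattern as conv1d_int8_to_int8_weight_8bit
 * in the sketch above, except the bias is stored as int8_t and is left-shifted by
 * bias_left_shift before being added. The shift amounts are example values only.
 *
 *     static int8_t conv_bias8[CONV_C];
 *
 *     uint8_t err = conv1d_int8_to_int8_weight_8bit_bias_shift(conv_in, conv_out,
 *         conv_weight, conv_bias8, &bean,
 *         7,  // output_right_shift
 *         4); // bias_left_shift
 */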

/** @brief conv1d_int16_to_int16_weight_16bit() - 1-dimensional convolution operation as below:
 *        output[c, l] = (sum(input[i, l * stride + k * dilation - padding_left] * weight[c, i, k]) + bias[c]) >> output_right_shift
 *        where the sum is taken over indices i and k.
 *        Saturation is included, i.e. output elements outside the range [-32768, 32767] are clamped to -32768 or 32767.
 * @param in: where the input, a 2D tensor of size [in_channel, in_length], is stored
 *            input is arranged in the form below; as an example, with in_channel = 14 and in_length = 10:
 *            input[0, 0], input[1, 0], input[2, 0], ..., input[13, 0], 0, 0,
 *            input[0, 1], input[1, 1], ..., input[13, 1], 0, 0, ...,
 *            input[0, 9], ..., input[13, 9], 0, 0
 *            zeros are padded so that the distance between input[i, j] and input[i, j + 1] is ceil(in_channel / 8.0) * 16 bytes
 * @param out: where the output, a 2D tensor of size [out_channel, out_length], will be written; it is arranged in the same form as the input,
 *             where out_length = 1 + floor((in_length + padding_left + padding_right - (kernel_length - 1) * dilation - 1) / stride)
 * @param weight: where the weight is stored.
 *                If in_channel, out_channel, and group are all equal, weight is a 3D tensor of size [out_channel, 1, kernel_length].
 *                It should be arranged in the form below; as an example, with kernel_length = 4, in_channel = 14, out_channel = 14:
 *                w[0, 0, 0], w[1, 0, 0], ..., w[7, 0, 0], w[0, 0, 1], w[1, 0, 1], ..., w[7, 0, 1], ..., w[0, 0, 3], ..., w[7, 0, 3],
 *                w[8, 0, 0], ..., w[13, 0, 0], 0, 0, w[8, 0, 1], ..., w[13, 0, 1], 0, 0, ..., w[8, 0, 3], ..., w[13, 0, 3], 0, 0
 *                zeros are padded so that the distance between w[i, 0, k] and w[i, 0, k + 1] is always 16 bytes.
 *                Otherwise, weight is a 3D tensor of size [out_channel, in_channel / group, kernel_length];
 *                it can be divided into groups, and within each group it is a 3D tensor of size [out_channel / group, in_channel / group, kernel_length].
 *                weight should be arranged in the form below; as an example, with kernel_length = 4, in_channel = 4, out_channel = 28, group = 2
 *                (note that in_channel / group = 2 and out_channel / group = 14, so output channels 0..13 form the 1st group and output channels 14..27 form the 2nd group):
 *                w[0, 0, 0], w[1, 0, 0], ..., w[7, 0, 0], w[0, 0, 1], w[1, 0, 1], ..., w[7, 0, 1], ..., w[0, 0, 3], ..., w[7, 0, 3],
 *                w[0, 1, 0], w[1, 1, 0], ..., w[7, 1, 0], w[0, 1, 1], w[1, 1, 1], ..., w[7, 1, 1], ..., w[0, 1, 3], ..., w[7, 1, 3],
 *                w[8, 0, 0], ..., w[13, 0, 0], 0, 0, w[8, 0, 1], ..., w[13, 0, 1], 0, 0, ..., w[8, 0, 3], ..., w[13, 0, 3], 0, 0
 *                w[8, 1, 0], ..., w[13, 1, 0], 0, 0, w[8, 1, 1], ..., w[13, 1, 1], 0, 0, ..., w[8, 1, 3], ..., w[13, 1, 3], 0, 0
 *                w[14, 0, 0], w[15, 0, 0], ..., w[21, 0, 0], w[14, 0, 1], w[15, 0, 1], ..., w[21, 0, 1], ..., w[14, 0, 3], ..., w[21, 0, 3],
 *                w[14, 1, 0], w[15, 1, 0], ..., w[21, 1, 0], w[14, 1, 1], w[15, 1, 1], ..., w[21, 1, 1], ..., w[14, 1, 3], ..., w[21, 1, 3],
 *                w[22, 0, 0], ..., w[27, 0, 0], 0, 0, w[22, 0, 1], ..., w[27, 0, 1], 0, 0, ..., w[22, 0, 3], ..., w[27, 0, 3], 0, 0
 *                w[22, 1, 0], ..., w[27, 1, 0], 0, 0, w[22, 1, 1], ..., w[27, 1, 1], 0, 0, ..., w[22, 1, 3], ..., w[27, 1, 3], 0, 0
 *                zeros are padded here to guarantee that the distance in memory between w[i, j, k] and w[i, j, k + 1] is 16 bytes
 * @param bias: where the bias is stored; if no bias is needed, this pointer should be set to NULL
 * @param bean: pointer to the config struct described at the beginning of this file
 * @param output_right_shift: the right-shift amount shown in the description of this function
 * @return 0 -- success
 * @return 1 -- in_channel is 0
 * @return 2 -- out_channel is 0
 * @return 3 -- in_length is 0
 * @return 4 -- kernel_length is 0
 * @return 5 -- stride is 0
 * @return 6 -- dilation is 0
 * @return 7 -- group is 0
 * @return 8 -- in_channel % group is not 0
 * @return 9 -- out_channel % group is not 0
 * @return 10 -- the computed out_length is not a positive number
 */
uint8_t conv1d_int16_to_int16_weight_16bit(int16_t *in, int16_t *out,
    int16_t *weight, int64_t *bias, struct conv1d_config_bean *bean,
    uint8_t output_right_shift);
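
/*
 * Usage sketch (added illustration; buffer sizes are assumptions derived from the
 * padded layouts above): in the 16-bit variant each padded row holds 8 int16
 * elements, so channels are padded to a multiple of 8. Example: a depthwise-style
 * call with in_channel = out_channel = group = 14, kernel_length = 4, in_length = 10.
 *
 *     enum { C16 = 14, L16 = 10, K16 = 4,
 *            C16_PAD = ((C16 + 7) / 8) * 8 }; // channels padded to a multiple of 8
 *
 *     static int16_t conv16_in[C16_PAD * L16];
 *     static int16_t conv16_out[C16_PAD * L16];
 *     static int16_t conv16_weight[C16_PAD * K16];
 *     static int64_t conv16_bias[C16]; // or NULL when no bias is needed
 *
 *     struct conv1d_config_bean bean16 = {0};
 *     bean16.in_channel = C16;  bean16.in_length = L16;
 *     bean16.out_channel = C16; bean16.kernel_length = K16;
 *     bean16.stride = 1; bean16.dilation = 1; bean16.group = C16;
 *
 *     uint8_t err = conv1d_int16_to_int16_weight_16bit(conv16_in, conv16_out,
 *         conv16_weight, conv16_bias, &bean16, 12); // example output_right_shift
 */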

/** @brief conv1d_int16_to_int16_weight_16bit_bias_shift() - 1-dimensional convolution operation as below:
 *        output[c, l] = (sum(input[i, l * stride + k * dilation - padding_left] * weight[c, i, k]) + (bias[c] << bias_left_shift)) >> output_right_shift
 *        where the sum is taken over indices i and k.
 *        Saturation is included, i.e. output elements outside the range [-32768, 32767] are clamped to -32768 or 32767.
 * @param in: where the input, a 2D tensor of size [in_channel, in_length], is stored
 *            input is arranged in the form below; as an example, with in_channel = 14 and in_length = 10:
 *            input[0, 0], input[1, 0], input[2, 0], ..., input[13, 0], 0, 0,
 *            input[0, 1], input[1, 1], ..., input[13, 1], 0, 0, ...,
 *            input[0, 9], ..., input[13, 9], 0, 0
 *            zeros are padded so that the distance between input[i, j] and input[i, j + 1] is ceil(in_channel / 8.0) * 16 bytes
 * @param out: where the output, a 2D tensor of size [out_channel, out_length], will be written; it is arranged in the same form as the input,
 *             where out_length = 1 + floor((in_length + padding_left + padding_right - (kernel_length - 1) * dilation - 1) / stride)
 * @param weight: where the weight is stored.
 *                If in_channel, out_channel, and group are all equal, weight is a 3D tensor of size [out_channel, 1, kernel_length].
 *                It should be arranged in the form below; as an example, with kernel_length = 4, in_channel = 14, out_channel = 14:
 *                w[0, 0, 0], w[1, 0, 0], ..., w[7, 0, 0], w[0, 0, 1], w[1, 0, 1], ..., w[7, 0, 1], ..., w[0, 0, 3], ..., w[7, 0, 3],
 *                w[8, 0, 0], ..., w[13, 0, 0], 0, 0, w[8, 0, 1], ..., w[13, 0, 1], 0, 0, ..., w[8, 0, 3], ..., w[13, 0, 3], 0, 0
 *                zeros are padded so that the distance between w[i, 0, k] and w[i, 0, k + 1] is always 16 bytes.
 *                Otherwise, weight is a 3D tensor of size [out_channel, in_channel / group, kernel_length];
 *                it can be divided into groups, and within each group it is a 3D tensor of size [out_channel / group, in_channel / group, kernel_length].
 *                weight should be arranged in the form below; as an example, with kernel_length = 4, in_channel = 4, out_channel = 28, group = 2
 *                (note that in_channel / group = 2 and out_channel / group = 14, so output channels 0..13 form the 1st group and output channels 14..27 form the 2nd group):
 *                w[0, 0, 0], w[1, 0, 0], ..., w[7, 0, 0], w[0, 0, 1], w[1, 0, 1], ..., w[7, 0, 1], ..., w[0, 0, 3], ..., w[7, 0, 3],
 *                w[0, 1, 0], w[1, 1, 0], ..., w[7, 1, 0], w[0, 1, 1], w[1, 1, 1], ..., w[7, 1, 1], ..., w[0, 1, 3], ..., w[7, 1, 3],
 *                w[8, 0, 0], ..., w[13, 0, 0], 0, 0, w[8, 0, 1], ..., w[13, 0, 1], 0, 0, ..., w[8, 0, 3], ..., w[13, 0, 3], 0, 0
 *                w[8, 1, 0], ..., w[13, 1, 0], 0, 0, w[8, 1, 1], ..., w[13, 1, 1], 0, 0, ..., w[8, 1, 3], ..., w[13, 1, 3], 0, 0
 *                w[14, 0, 0], w[15, 0, 0], ..., w[21, 0, 0], w[14, 0, 1], w[15, 0, 1], ..., w[21, 0, 1], ..., w[14, 0, 3], ..., w[21, 0, 3],
 *                w[14, 1, 0], w[15, 1, 0], ..., w[21, 1, 0], w[14, 1, 1], w[15, 1, 1], ..., w[21, 1, 1], ..., w[14, 1, 3], ..., w[21, 1, 3],
 *                w[22, 0, 0], ..., w[27, 0, 0], 0, 0, w[22, 0, 1], ..., w[27, 0, 1], 0, 0, ..., w[22, 0, 3], ..., w[27, 0, 3], 0, 0
 *                w[22, 1, 0], ..., w[27, 1, 0], 0, 0, w[22, 1, 1], ..., w[27, 1, 1], 0, 0, ..., w[22, 1, 3], ..., w[27, 1, 3], 0, 0
 *                zeros are padded here to guarantee that the distance in memory between w[i, j, k] and w[i, j, k + 1] is 16 bytes
 * @param bias: where the bias is stored; if no bias is needed, this pointer should be set to NULL
 * @param bean: pointer to the config struct described at the beginning of this file
 * @param output_right_shift: the right-shift amount shown in the description of this function
 * @param bias_left_shift: the left-shift amount shown in the description of this function
 * @return 0 -- success
 * @return 1 -- in_channel is 0
 * @return 2 -- out_channel is 0
 * @return 3 -- in_length is 0
 * @return 4 -- kernel_length is 0
 * @return 5 -- stride is 0
 * @return 6 -- dilation is 0
 * @return 7 -- group is 0
 * @return 8 -- in_channel % group is not 0
 * @return 9 -- out_channel % group is not 0
 * @return 10 -- the computed out_length is not a positive number
 */
uint8_t conv1d_int16_to_int16_weight_16bit_bias_shift(int16_t *in, int16_t *out,
    int16_t *weight, int16_t *bias, struct conv1d_config_bean *bean,
    uint8_t output_right_shift, uint8_t bias_left_shift);
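
/*
 * Usage sketch (added illustration): same call pattern as conv1d_int16_to_int16_weight_16bit
 * in the sketch above, except the bias is stored as int16_t and is left-shifted by
 * bias_left_shift before being added. The shift amounts are example values only.
 *
 *     static int16_t conv16_bias16[C16];
 *
 *     uint8_t err = conv1d_int16_to_int16_weight_16bit_bias_shift(conv16_in, conv16_out,
 *         conv16_weight, conv16_bias16, &bean16,
 *         12, // output_right_shift
 *         6); // bias_left_shift
 */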

/**
  * @}
  */

#ifdef __cplusplus
}
#endif

#endif