/****************************************************************************

  Copyright(c) 2019 by Aerospace C.Power (Chongqing) Microelectronics. ALL RIGHTS RESERVED.

  This Information is proprietary to Aerospace C.Power (Chongqing) Microelectronics and MAY NOT
  be copied by any method or incorporated into another program without
  the express written consent of Aerospace C.Power. This Information or any portion
  thereof remains the property of Aerospace C.Power. The Information contained herein
  is believed to be accurate and Aerospace C.Power assumes no responsibility or
  liability for its use in any way and conveys no license or title under
  any patent or copyright and makes no representation or warranty that this
  Information is free from patent or copyright infringement.

****************************************************************************/
#ifndef CNN_CONV_2D_H
#define CNN_CONV_2D_H

/* os shim includes */
#include "os_types_api.h"

#ifdef __cplusplus
extern "C" {
#endif

/** \defgroup cnn conv2d API
 * @brief cnn conv2d APIs
 * These functions can be used to calculate a 2-dimensional convolution or to add a bias to a tensor.
 * For every feature, each column occupies a size that is a multiple of 32 bytes. Here "column" means the last dimension of a tensor.
 * For example, an int8_t tensor A of size 4 * 30 * 300 is stored in memory in the following form, so that every column occupies 320 bytes:
 * A[0][0][0], A[0][0][1], ..., A[0][0][299], 0, 0, ..., 0, A[0][1][0], A[0][1][1], ..., A[0][1][299], 0, 0, ..., 0, ..., A[0][29][0], ..., A[0][29][299], 0, ..., 0, A[1][0][0], ...
 * (A short sketch of this layout follows this comment block.)
 * Weights must be arranged into a specific form; you can use the cnn_arrange_weight_* functions (declared in cnn_tensor_util.h) to arrange the weights in ddr, or arrange them before they are loaded into ddr.
 * Setting out_sep_mode to 1 can accelerate the calculation; however, in this mode the cpu cannot access the output, and all of the conditions below must be satisfied:
 * (1) this layer must be either depthwise or pointwise; innerproduct (or fc) can be regarded as pointwise, and pooling can be regarded as depthwise
 * (2) if this layer is pointwise, the next layer must be depthwise; if this layer is depthwise, the next layer must be pointwise
 * (3) the output width plus the left padding of the next layer must be no larger than 16 bytes
 * (4) the stride of the next layer must be 1
 * (5) (height + next_pad_up) * channel of the output must be no larger than 8192
 * If out_sep_mode of the previous layer is set to 1, in_sep_mode of this layer must be 1.
 */
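
/*
 * A minimal sketch of the padded column layout described above. The helper names
 * and the elem_size parameter are illustrative assumptions, not part of this API.
 * Each column (the last dimension) is rounded up to a multiple of 32 bytes, so the
 * int8_t example of size 4 * 30 * 300 uses a 320-byte column stride.
 *
 *   static uint32_t cnn_column_stride_bytes(uint32_t width, uint32_t elem_size)
 *   {
 *       return (width * elem_size + 31u) & ~31u;    // round up to a multiple of 32 bytes
 *   }
 *
 *   // byte offset of element A[c][h][w] in a tensor of shape channel * height * width
 *   static uint32_t cnn_element_offset(uint32_t c, uint32_t h, uint32_t w,
 *                                      uint32_t height, uint32_t width, uint32_t elem_size)
 *   {
 *       uint32_t stride = cnn_column_stride_bytes(width, elem_size);
 *       return (c * height + h) * stride + w * elem_size;
 *   }
 *
 * For the 4 * 30 * 300 int8_t example, cnn_column_stride_bytes(300, 1) == 320,
 * so A[0][1][0] starts at byte offset 320 and A[1][0][0] starts at byte offset 9600.
 */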

/** @addtogroup cnn_conv2d_APIs
 * @{
 */

enum nonlinearty_flag {
    none = 0,
    sigmoid = 1,
    relu = 2,
    tanh_ = 3,
    leaky_relu = 5,
    relu6 = 6,
    prelu = 7
};

struct conv_2d_config_bean
{
    uint16_t in_channel;            // number of input channels
    uint16_t in_height;             // height of the input
    uint16_t in_width;              // width of the input
    uint16_t out_height;            // does not need to be set; stores the height of the output when the calculation finishes
    uint16_t out_width;             // does not need to be set; stores the width of the output when the calculation finishes
    uint16_t out_channel;           // number of output channels; does not need to be set for a depthwise conv
    uint16_t group;                 // number of groups; does not need to be set for a depthwise conv
    uint8_t kernel_size_h;          // height of the convolving kernel
    uint8_t kernel_size_w;          // width of the convolving kernel
    uint8_t stride;
    uint8_t dilation;
    uint8_t bias_en;                // whether to add the bias while calculating the conv
    uint8_t softmax;                // whether to calculate softmax after the conv and the activation function
    uint8_t mac_8bit;               // non-zero: 8-bit mode; zero: 16-bit mode
    int8_t padding_to_same;         // non-zero: the conv output size is padded to ceil((input size + 2 * padding) / stride); zero: the conv output size is calculated from the config (see the size sketch after this struct)
    uint8_t padding;                // zero-padding added to both sides of the input
    uint8_t input_signed;           // whether the input is signed
    uint8_t weight_bias_signed;     // whether the weight and bias are signed
    uint8_t filter_lsb_channelwise; // whether the filter LSB differs from channel to channel
    uint8_t acc_out_shift;          // right shift bits applied to the output of the acc array; should be input_fraction + weight_fraction - output_fraction; only valid when filter_lsb_channelwise is 0
    uint8_t bias_shift;             // left shift bits applied to the bias when it is added to the acc; should be input_fraction + weight_fraction - bias_fraction; only valid when filter_lsb_channelwise is 0
    int8_t *prelu_param;            // the parameters of prelu; the LSB is 2^(-6)
    int8_t leaky_param;             // the multiplier of leaky relu; the LSB is 2^(-6)
    uint8_t input_iram;             // non-zero: read the input from iram; 0: read the input from ddr; only valid if input_dpdp_chain is 0
    uint8_t output_iram;            // non-zero: put the output into iram; 0: put the output into ddr; only valid if output_dpdp_chain is 0
    uint8_t in_sep_mode;            // whether to read the input from iram in separable conv mode
    uint8_t out_sep_mode;           // whether to put the output into iram in separable conv mode
    uint8_t next_padding_left;      // left padding of the next layer; set this manually if the next layer uses "padding to same"; only valid when out_sep_mode is not 0
    uint8_t next_padding_up;        // up padding of the next layer; set this manually if the next layer uses "padding to same"; only valid when out_sep_mode is not 0
    uint8_t next_padding_type;      // only valid when out_sep_mode is not 0; set it to 1 if the next layer is maxpool, otherwise set it to 0
    enum nonlinearty_flag nonlinearty; // the nonlinearity applied after the conv; one of "none", "sigmoid", "relu", "tanh", "relu6", "leaky relu", "prelu"; for prelu, out_channel must be no larger than 16 and group must be 1, otherwise you need to separate your filter and issue several conv commands to complete the whole conv calculation
};
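
/*
 * A minimal sketch of how the output size relates to this config, for one spatial
 * dimension. The padding_to_same branch follows the ceil((input size + 2 * padding) / stride)
 * rule documented above; the other branch assumes the usual convolution output formula,
 * which is an illustrative assumption rather than a statement about the hardware.
 *
 *   static uint16_t conv_2d_expected_out_size(uint16_t in_size, uint8_t kernel_size,
 *                                             const struct conv_2d_config_bean *cfg)
 *   {
 *       if (cfg->padding_to_same) {
 *           // documented behaviour: output padded to ceil((input size + 2 * padding) / stride)
 *           return (uint16_t)((in_size + 2u * cfg->padding + cfg->stride - 1u) / cfg->stride);
 *       }
 *       // assumed standard formula: floor((in + 2 * pad - dilation * (kernel - 1) - 1) / stride) + 1
 *       uint16_t effective_kernel = (uint16_t)(cfg->dilation * (uint16_t)(kernel_size - 1u) + 1u);
 *       return (uint16_t)((in_size + 2u * cfg->padding - effective_kernel) / cfg->stride + 1u);
 *   }
 */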

/** @brief cnn_conv2d() - run a convolution on the cnn engine; input_addr, output_addr and weight_addr must be multiples of 256
 * @param input_addr: where the input is put
 * @param output_addr: where the output will be put
 * @param weight_addr: where the weight (including the bias and shift numbers if necessary) is put
 * @param config: config info of the convolution
 * @return 0 -- success
 * @return 1 -- input channel is smaller than group
 * @return 2 -- input height is 0
 * @return 3 -- input width is 0
 * @return 4 -- output channel is smaller than group
 * @return 5 -- input height is so small that the output height would be 0
 * @return 6 -- input width is so small that the output width would be 0
 * @return 7 -- kernel is 0
 * @return 8 -- kernel is too large
 * @return 9 -- stride is 0
 * @return 10 -- stride is too large
 * @return 11 -- dilation is 0
 * @return 12 -- dilation is too large
 * @return 13 -- group is 0
 * @return 14 -- group is larger than 1, so prelu is not supported
 * @return 15 -- output channel is larger than 16, so prelu is not supported
 */
uint8_t cnn_conv2d(uint32_t input_addr, uint32_t output_addr,
                   uint32_t weight_addr, struct conv_2d_config_bean *config);
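
/*
 * A usage sketch (illustrative only): running a 3x3 conv with bias and relu.
 * The addresses and the fraction-derived shift values are assumptions for
 * illustration; the buffers must follow the layout described at the top of this
 * file and the three addresses must be multiples of 256.
 *
 *   struct conv_2d_config_bean cfg = {0};
 *   cfg.in_channel         = 16;
 *   cfg.in_height          = 32;
 *   cfg.in_width           = 32;
 *   cfg.out_channel        = 32;
 *   cfg.group              = 1;
 *   cfg.kernel_size_h      = 3;
 *   cfg.kernel_size_w      = 3;
 *   cfg.stride             = 1;
 *   cfg.dilation           = 1;
 *   cfg.padding            = 1;
 *   cfg.padding_to_same    = 1;
 *   cfg.bias_en            = 1;
 *   cfg.mac_8bit           = 1;    // 8-bit mode
 *   cfg.input_signed       = 1;
 *   cfg.weight_bias_signed = 1;
 *   cfg.acc_out_shift      = 7;    // input_fraction + weight_fraction - output_fraction (assumed values)
 *   cfg.bias_shift         = 0;    // input_fraction + weight_fraction - bias_fraction (assumed values)
 *   cfg.nonlinearty        = relu;
 *
 *   uint8_t ret = cnn_conv2d(0x20000000u, 0x20100000u, 0x20200000u, &cfg);
 *   if (ret != 0) {
 *       // handle the error code documented above
 *   }
 */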

/** @brief cnn_conv2d_depthwise() - run a depthwise convolution on the cnn engine; input_addr, output_addr and weight_addr must be multiples of 256
 * @param input_addr: where the input is put
 * @param output_addr: where the output will be put
 * @param weight_addr: where the weight (including the bias and shift numbers if necessary) is put
 * @param config: config info of the convolution
 * @return 0 -- success
 * @return 1 -- input channel is 0
 * @return 2 -- input height is 0
 * @return 3 -- input width is 0
 * @return 5 -- input height is so small that the output height would be 0
 * @return 6 -- input width is so small that the output width would be 0
 * @return 7 -- kernel is 0
 * @return 8 -- kernel is too large
 * @return 9 -- stride is 0
 * @return 10 -- stride is too large
 * @return 11 -- dilation is 0
 * @return 12 -- dilation is too large
 * @return 14 -- group is larger than 1, so prelu is not supported
 * @return 15 -- output channel is larger than 16, so prelu is not supported
 */
uint8_t cnn_conv2d_depthwise(uint32_t input_addr, uint32_t output_addr,
                             uint32_t weight_addr, struct conv_2d_config_bean *config);
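
/*
 * A usage sketch (illustrative only): a 3x3 depthwise conv. As noted in the struct
 * comments, out_channel and group do not need to be set for a depthwise conv; the
 * addresses are assumptions for illustration and must be multiples of 256.
 *
 *   struct conv_2d_config_bean dw_cfg = {0};
 *   dw_cfg.in_channel         = 32;
 *   dw_cfg.in_height          = 16;
 *   dw_cfg.in_width           = 16;
 *   dw_cfg.kernel_size_h      = 3;
 *   dw_cfg.kernel_size_w      = 3;
 *   dw_cfg.stride             = 2;
 *   dw_cfg.dilation           = 1;
 *   dw_cfg.padding            = 1;
 *   dw_cfg.mac_8bit           = 1;
 *   dw_cfg.input_signed       = 1;
 *   dw_cfg.weight_bias_signed = 1;
 *   dw_cfg.nonlinearty        = none;
 *
 *   uint8_t ret = cnn_conv2d_depthwise(0x20000000u, 0x20100000u, 0x20200000u, &dw_cfg);
 */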

/** @brief cnn_bias_add() - run a bias-add on the cnn engine; input_addr, output_addr and bias_addr must be multiples of 256. It is NOT recommended to use this function, since the bias can already be added while the conv is calculated in cnn_conv2d or cnn_conv2d_depthwise
 * @param input_addr: where the input is put
 * @param output_addr: where the output will be put
 * @param bias_addr: where the bias is put; the bias has to be put in ddr in this form: B[0], 1, 0, 0, ..., 0, B[1], 1, 0, 0, ..., 0, B[2], 1, 0, ..., 0, ..., B[channel - 1], 1, 0, 0, ..., 0 (for int8, there are 30 zeros between the 1 and the next B[]; for int16, there are 14); a packing sketch follows the declaration below
 * @param channel: channel of the input
 * @param height: height of the input
 * @param width: width of the input
 * @param mac_8bit: non-zero - input and output are of type int8_t; zero - input and output are of type int16_t
 * @param input_signed: whether the input is signed
 * @param bias_signed: whether the bias is signed
 * @param right_shift: the right shift bits of the output
 * @param nonlinear: the nonlinear operation after the bias add; cannot be prelu
 * @return 0 -- success
 * @return 1 -- input channel is 0
 * @return 2 -- input height is 0
 * @return 3 -- input width is 0
 * @return 4 -- prelu is not supported here
 */
uint8_t cnn_bias_add(uint32_t input_addr, uint32_t output_addr,
                     uint32_t bias_addr, uint16_t channel, uint16_t height, uint16_t width,
                     uint8_t mac_8bit, uint8_t input_signed, uint8_t bias_signed,
                     uint8_t right_shift, enum nonlinearty_flag nonlinear);
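
/*
 * A minimal packing sketch for the bias layout documented above (int8 mode shown;
 * int16 mode is analogous with 16-bit elements and 14 zeros). The helper name and
 * pointers are illustrative; the caller must make sure the packed buffer ends up
 * at a bias_addr that is a multiple of 256.
 *
 *   static void pack_bias_int8(int8_t *dst, const int8_t *bias, uint16_t channel)
 *   {
 *       for (uint16_t c = 0; c < channel; c++) {
 *           int8_t *entry = dst + 32 * c;      // each channel entry occupies 32 bytes
 *           entry[0] = bias[c];                // B[c]
 *           entry[1] = 1;                      // constant 1 after each bias value
 *           for (int i = 2; i < 32; i++) {
 *               entry[i] = 0;                  // 30 zeros for int8 mode
 *           }
 *       }
 *   }
 */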

/** @brief cnn_multi_const() - multiply the input tensor by a constant and then add a bias
 * @param input_addr: where the input is put
 * @param height: height of the input
 * @param width: width of the input
 * @param multi_const: the constant used as the multiplier
 * @param bias: the bias added after the multiplication
 * @param output_addr: where the output will be put
 * @param mac_8bit: non-zero - input and output are of type int8_t; zero - input and output are of type int16_t
 * @param input_signed: non-zero - the input is an unsigned number; zero - the input is a signed number; only valid when mac_8bit is 0, otherwise the input is treated as a signed number
 * @param weight_bias_signed: non-zero - the weight and bias are unsigned numbers; zero - the weight and bias are signed numbers; only valid when mac_8bit is 0, otherwise the weight and bias are treated as signed numbers
 * @param bias_left_shift: the left shift bits of the bias when it is added
 * @param out_right_shift: the right shift bits of the output; this can be used to avoid saturation
 * @param nonlinear: the nonlinear operation after the bias add; cannot be prelu or leaky_relu
 * @return 0 -- success
 * @return 1 -- input address is not a multiple of 32
 * @return 2 -- input height is 0
 * @return 3 -- input width is 0
 * @return 4 -- output address is not a multiple of 32
 */
uint8_t cnn_multi_const(uint32_t input_addr, uint32_t height, uint32_t width,
                        int32_t multi_const, int32_t bias, uint32_t output_addr, uint8_t mac_8bit,
                        uint8_t input_signed, uint8_t weight_bias_signed, uint8_t bias_left_shift,
                        uint8_t out_right_shift, enum nonlinearty_flag nonlinear);
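
/*
 * A usage sketch (illustrative only): scaling an int8 tensor by a fixed-point
 * constant and adding a bias. The addresses, constant and shift values are
 * assumptions for illustration; per the return codes above, input_addr and
 * output_addr must be multiples of 32.
 *
 *   // roughly out = (in * 77 + (5 << 2)) >> 6, followed by relu
 *   // (fixed-point interpretation assumed from the parameter descriptions above)
 *   uint8_t ret = cnn_multi_const(0x20000000u,   // input_addr
 *                                 32, 32,        // height, width
 *                                 77,            // multi_const
 *                                 5,             // bias
 *                                 0x20010000u,   // output_addr
 *                                 1,             // mac_8bit: int8_t data
 *                                 0,             // input_signed
 *                                 0,             // weight_bias_signed
 *                                 2,             // bias_left_shift
 *                                 6,             // out_right_shift
 *                                 relu);
 */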

/**
 * @}
 */

#ifdef __cplusplus
}
#endif

#endif