kunlun/export/inc/cnn/cnn_nonlinear.h

/****************************************************************************

Copyright(c) 2019 by Aerospace C.Power (Chongqing) Microelectronics. ALL RIGHTS RESERVED.

This Information is proprietary to Aerospace C.Power (Chongqing) Microelectronics and MAY NOT
be copied by any method or incorporated into another program without
the express written consent of Aerospace C.Power. This Information or any portion
thereof remains the property of Aerospace C.Power. The Information contained herein
is believed to be accurate and Aerospace C.Power assumes no responsibility or
liability for its use in any way and conveys no license or title under
any patent or copyright and makes no representation or warranty that this
Information is free from patent or copyright infringement.

****************************************************************************/
#ifndef CNN_NONLINEAR_H
#define CNN_NONLINEAR_H
/* os shim includes */
#include "os_types_api.h"

#ifdef __cplusplus
extern "C" {
#endif

/** \defgroup cnn_nonlinear_APIs cnn nonlinear API
  * @brief cnn nonlinear APIs
  * These functions can be used to compute nonlinear operations, including relu, relu6, sigmoid, tanh, leaky_relu, prelu and softmax.
  * The CNN engine is used by all of these functions except cpu_prelu_ddr, cpu_prelu_before_depth, cpu_prelu_before_point and cpu_prelu_iram.
  * You might want to use the functions in cnn_conv_2d.h to perform an operation consisting of a convolution followed by bias add and nonlinearity.
  * For any feature tensor, every column occupies a size that is a multiple of 32 bytes. Here "column" means the last dimension of a tensor.
  * For example, a tensor A of type int8_t and size 4 * 30 * 300 is stored in memory in the following form, so that every column occupies 320 bytes:
  * A[0][0][0], A[0][0][1], ..., A[0][0][299], 0, 0, 0, ..., 0, A[0][1][0], A[0][1][1], ..., A[0][1][299], 0, 0, 0, 0, ..., 0, ..., A[0][29][0], ..., A[0][29][299], 0, ..., 0, A[1][0][0], ...
  *
  *
  *
  */

/** @addtogroup cnn_nonlinear_APIs
  * @{
  */


/**
 * @brief cnn_relu() - do a relu operation
 * @param input_addr address where the input is stored
 * @param output_addr address where the output will be written
 * @param channel channel of input
 * @param height height of input
 * @param width width of input
 * @param mac_8bit non-zero - input and output are of type int16_t, zero - input and output are of type int8_t
 *        (NOTE(review): this polarity is the opposite of what the parameter name suggests - confirm against the implementation)
 * @retval 0 succeed
 * @retval 1 input channel is 0
 * @retval 2 input height is 0
 * @retval 3 input width is 0
 */
uint8_t cnn_relu(uint32_t input_addr, uint32_t output_addr, uint16_t channel, uint16_t height, uint16_t width, uint8_t mac_8bit);

/**
 * @brief cnn_relu6() - do a relu6 operation
 *
 * The fixed-point fraction of the output is 4 (int8 mode) or 12 (int16 mode).
 *
 * @param input_addr address where the input is stored
 * @param output_addr address where the output will be written
 * @param channel channel of input
 * @param height height of input
 * @param width width of input
 * @param mac_8bit non-zero - input and output are of type int16_t, zero - input and output are of type int8_t
 *        (NOTE(review): this polarity is the opposite of what the parameter name suggests - confirm against the implementation)
 * @param fraction the fixed-point fraction of the input; it is recommended that fraction be 4 (int8 mode) or 12 (int16 mode)
 * @retval 0 succeed
 * @retval 1 input channel is 0
 * @retval 2 input height is 0
 * @retval 3 input width is 0
 */
uint8_t cnn_relu6(uint32_t input_addr, uint32_t output_addr, uint16_t channel, uint16_t height, uint16_t width, uint8_t mac_8bit, uint8_t fraction);

/**
 * @brief cnn_sigmoid() - do a sigmoid operation
 *
 * The fixed-point fraction of the output is 7 (int8 mode) or 15 (int16 mode).
 *
 * @param input_addr address where the input is stored
 * @param output_addr address where the output will be written
 * @param channel channel of input
 * @param height height of input
 * @param width width of input
 * @param mac_8bit non-zero - input and output are of type int16_t, zero - input and output are of type int8_t
 *        (NOTE(review): this polarity is the opposite of what the parameter name suggests - confirm against the implementation)
 * @param fraction the fixed-point fraction of the input; it is recommended that fraction be 4 (int8 mode) or 12 (int16 mode)
 * @retval 0 succeed
 * @retval 1 input channel is 0
 * @retval 2 input height is 0
 * @retval 3 input width is 0
 */
uint8_t cnn_sigmoid(uint32_t input_addr, uint32_t output_addr, uint16_t channel, uint16_t height, uint16_t width, uint8_t mac_8bit, uint8_t fraction);

/**
 * @brief cnn_tanh() - do a tanh operation
 *
 * The fixed-point fraction of the output is 7 (int8 mode) or 15 (int16 mode).
 *
 * @param input_addr address where the input is stored
 * @param output_addr address where the output will be written
 * @param channel channel of input
 * @param height height of input
 * @param width width of input
 * @param mac_8bit non-zero - input and output are of type int16_t, zero - input and output are of type int8_t
 *        (NOTE(review): this polarity is the opposite of what the parameter name suggests - confirm against the implementation)
 * @param fraction the fixed-point fraction of the input; it is recommended that fraction be 5 (int8 mode) or 13 (int16 mode)
 * @retval 0 succeed
 * @retval 1 input channel is 0
 * @retval 2 input height is 0
 * @retval 3 input width is 0
 */
uint8_t cnn_tanh(uint32_t input_addr, uint32_t output_addr, uint16_t channel, uint16_t height, uint16_t width, uint8_t mac_8bit, uint8_t fraction);

/**
 * @brief cnn_leaky_relu() - do a leaky relu operation
 * @param input_addr address where the input is stored
 * @param output_addr address where the output will be written
 * @param channel channel of input
 * @param height height of input
 * @param width width of input
 * @param mac_8bit non-zero - input and output are of type int16_t, zero - input and output are of type int8_t
 *        (NOTE(review): this polarity is the opposite of what the parameter name suggests - confirm against the implementation)
 * @param leaky_param the multiplier of leaky relu, a fixed-point value whose fraction is 6
 * @retval 0 succeed
 * @retval 1 input channel is 0
 * @retval 2 input height is 0
 * @retval 3 input width is 0
 */
uint8_t cnn_leaky_relu(uint32_t input_addr, uint32_t output_addr, uint16_t channel, uint16_t height, uint16_t width, uint8_t mac_8bit, int8_t leaky_param);

/**
 * @brief cnn_prelu() - do a prelu operation
 * @param input_addr address where the input is stored
 * @param output_addr address where the output will be written
 * @param channel channel of input
 * @param height height of input
 * @param width width of input
 * @param mac_8bit non-zero - input and output are of type int16_t, zero - input and output are of type int8_t
 *        (NOTE(review): this polarity is the opposite of what the parameter name suggests - confirm against the implementation)
 * @param prelu_param pointer to the multiplier(s) of prelu, fixed-point values whose fraction is 6
 *        (presumably one multiplier per channel - TODO confirm the expected array length)
 * @param right_shift non-zero - right shift the output by 1 bit; this is used to avoid saturation
 * @retval 0 succeed
 * @retval 1 input channel is 0
 * @retval 2 input height is 0
 * @retval 3 input width is 0
 */
uint8_t cnn_prelu(uint32_t input_addr, uint32_t output_addr, uint16_t channel, uint16_t height, uint16_t width, uint8_t mac_8bit, int8_t *prelu_param, uint8_t right_shift);

/**
 * @brief cpu_prelu_ddr() - do a prelu operation
 *
 * You might need to flush the cache afterwards to ensure that all of the
 * output has been written into ddr.
 *
 * @param input_addr address where the input is stored
 * @param output_addr address where the output will be written
 * @param channel channel of input
 * @param height height of input
 * @param width width of input
 * @param mac_8bit non-zero - input and output are of type int16_t, zero - input and output are of type int8_t
 *        (NOTE(review): this polarity is the opposite of what the parameter name suggests - confirm against the implementation)
 * @param prelu_param pointer to the multiplier(s) of prelu, fixed-point values whose fraction is 6
 *        (presumably one multiplier per channel - TODO confirm the expected array length)
 * @param right_shift non-zero - right shift the output by 1 bit; this is used to avoid saturation
 * @retval 0 succeed
 * @retval 1 input channel is 0
 * @retval 2 input height is 0
 * @retval 3 input width is 0
 */
uint8_t cpu_prelu_ddr(uint32_t input_addr, uint32_t output_addr, uint16_t channel, uint16_t height, uint16_t width, uint8_t mac_8bit, int8_t *prelu_param, uint8_t right_shift);

/**
 * @brief cpu_prelu_iram() - do a prelu operation on data held in iram
 *
 * The input and output are both in iram, at address 0x73fe0000. This function
 * is often used when the previous output was put in iram and a prelu is needed
 * before the next conv layer or pool layer.
 *
 * @param channel channel of input
 * @param height height of input
 * @param width width of input
 * @param mac_8bit non-zero - input and output are of type int16_t, zero - input and output are of type int8_t
 *        (NOTE(review): this polarity is the opposite of what the parameter name suggests - confirm against the implementation)
 * @param prelu_param pointer to the multiplier(s) of prelu, fixed-point values whose fraction is 6
 *        (presumably one multiplier per channel - TODO confirm the expected array length)
 * @param right_shift non-zero - right shift the output by 1 bit; this is used to avoid saturation
 * @retval 0 succeed
 * @retval 1 input channel is 0
 * @retval 2 input height is 0
 * @retval 3 input width is 0
 */
uint8_t cpu_prelu_iram(uint16_t channel, uint16_t height, uint16_t width, uint8_t mac_8bit, int8_t *prelu_param, uint8_t right_shift);

/**
 * @brief cpu_prelu_before_depth() - do a prelu operation ahead of a depthwise conv or pool layer
 *
 * The input and output are both in iram, at address 0x73fc0000. This function
 * is often used when the previous output was put in out_sep_mode and a prelu
 * is needed before the next layer, which should be a depthwise conv layer or
 * a pool layer.
 *
 * @param channel channel of input
 * @param height height of input
 * @param width width of input
 * @param next_pad_up the pad number of the next layer on the upper edge
 * @param next_pad_left the pad number of the next layer on the left edge
 * @param mac_8bit non-zero - input and output are of type int16_t, zero - input and output are of type int8_t
 *        (NOTE(review): this polarity is the opposite of what the parameter name suggests - confirm against the implementation)
 * @param prelu_param pointer to the multiplier(s) of prelu, fixed-point values whose fraction is 6
 *        (presumably one multiplier per channel - TODO confirm the expected array length)
 * @param right_shift non-zero - right shift the output by 1 bit; this is used to avoid saturation
 * @retval 0 succeed
 * @retval 1 input channel is 0
 * @retval 2 input height is 0
 * @retval 3 input width is 0
 */
uint8_t cpu_prelu_before_depth(uint16_t channel, uint16_t height, uint16_t width, uint8_t next_pad_up, uint8_t next_pad_left, uint8_t mac_8bit, int8_t *prelu_param, uint8_t right_shift);

/**
 * @brief cpu_prelu_before_point() - do a prelu operation ahead of a pointwise conv layer
 *
 * The input and output are both in iram, at address 0x74020000. This function
 * is often used when the previous output was put in out_sep_mode and a prelu
 * is needed before the next conv layer, which should be pointwise.
 *
 * @param channel channel of input
 * @param height height of input
 * @param width width of input
 * @param mac_8bit non-zero - input and output are of type int16_t, zero - input and output are of type int8_t
 *        (NOTE(review): this polarity is the opposite of what the parameter name suggests - confirm against the implementation)
 * @param prelu_param pointer to the multiplier(s) of prelu, fixed-point values whose fraction is 6
 *        (presumably one multiplier per channel - TODO confirm the expected array length)
 * @param right_shift non-zero - right shift the output by 1 bit; this is used to avoid saturation
 * @retval 0 succeed
 * @retval 1 input channel is 0
 * @retval 2 input height is 0
 * @retval 3 input width is 0
 */
uint8_t cpu_prelu_before_point(uint16_t channel, uint16_t height, uint16_t width, uint8_t mac_8bit, int8_t *prelu_param, uint8_t right_shift);

/**
 * @brief cnn_softmax() - do a softmax operation
 *
 * The input tensor's shape should be channel * 1 * width. The output tensor
 * has the same size, and
 *   output[i][0][j] = input[i][0][j] - ln(exp(input[0][0][j]) + exp(input[1][0][j]) + ... + exp(input[channel-1][0][j]))
 * The fixed-point fraction of the output is 4 (int8 mode) or 12 (int16 mode),
 * and the fraction of the input should be 4 (int8 mode) or 12 (int16 mode).
 *
 * There is no height parameter: the height of the input is fixed at 1 by the
 * required tensor shape.
 *
 * @param input_addr address where the input is stored
 * @param output_addr address where the output will be written
 * @param channel channel of input
 * @param width width of input
 * @param mac_8bit non-zero - input and output are of type int16_t, zero - input and output are of type int8_t
 *        (NOTE(review): this polarity is the opposite of what the parameter name suggests - confirm against the implementation)
 * @param weight_addr the engine needs to perform a pointwise conv before the softmax computation; the weight should be of shape [channel, channel, 1, 1] with weight[i, j, 0, 0] = (i == j) ? 1 : 0
 * @retval 0 succeed
 * @retval 1 input channel is 0
 * @retval 2 input height is 0 (NOTE(review): no height is passed to this function, so this code is presumably never returned - confirm)
 * @retval 3 input width is 0
 */
uint8_t cnn_softmax(uint32_t input_addr, uint32_t output_addr, uint16_t channel, uint16_t width, uint8_t mac_8bit, uint32_t weight_addr);


/**
  * @}
  */

#ifdef __cplusplus
}
#endif

#endif