// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// fixedpoint_neon.h: optimized NEON specializations of the templates
// in fixedpoint.h.

#ifndef GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_
#define GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_

#include <arm_neon.h>

#include <cstdint>  // for std::int32_t, used in the traits and Dup below

namespace gemmlowp {

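// Raw-type traits for int32x4_t: a NEON q-register holds four 32-bit lanes,
// so the underlying scalar type is std::int32_t and kLanes is 4.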
template <>
struct FixedPointRawTypeTraits<int32x4_t> {
  typedef std::int32_t ScalarRawType;
  static const int kLanes = 4;
};

template <>
inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) {
  return vandq_s32(a, b);
}

template <>
inline int32x4_t BitOr(int32x4_t a, int32x4_t b) {
  return vorrq_s32(a, b);
}

template <>
inline int32x4_t BitXor(int32x4_t a, int32x4_t b) {
  return veorq_s32(a, b);
}

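// XOR with an all-ones vector computes the bitwise complement
// (vmvnq_s32 would be an equivalent alternative).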
template <>
inline int32x4_t BitNot(int32x4_t a) {
  return veorq_s32(a, vdupq_n_s32(-1));
}

template <>
inline int32x4_t Add(int32x4_t a, int32x4_t b) {
  return vaddq_s32(a, b);
}

template <>
inline int32x4_t Sub(int32x4_t a, int32x4_t b) {
  return vsubq_s32(a, b);
}

template <>
inline int32x4_t Neg(int32x4_t a) {
  return vnegq_s32(a);
}

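// vshlq_s32 shifts each lane by a signed per-lane amount: positive counts
// shift left, negative counts shift right arithmetically. Both shift
// directions below are therefore expressed as vshlq_s32 with a broadcast
// (possibly negated) offset.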
template <>
inline int32x4_t ShiftLeft(int32x4_t a, int offset) {
  return vshlq_s32(a, vdupq_n_s32(offset));
}

template <>
inline int32x4_t ShiftRight(int32x4_t a, int offset) {
  return vshlq_s32(a, vdupq_n_s32(-offset));
}

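// vbslq_s32 is a bit-select: result bits come from then_val where the
// corresponding if_mask bit is 1, and from else_val where it is 0. The
// MaskIf* functions below produce all-ones/all-zeros lanes, making this a
// per-lane select.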
template <>
inline int32x4_t SelectUsingMask(int32x4_t if_mask, int32x4_t then_val,
                                 int32x4_t else_val) {
  return vbslq_s32(vreinterpretq_u32_s32(if_mask), then_val, else_val);
}

template <>
inline int32x4_t MaskIfEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vceqq_s32(a, b));
}

template <>
inline int32x4_t MaskIfNotEqual(int32x4_t a, int32x4_t b) {
  return BitNot(MaskIfEqual(a, b));
}

template <>
inline int32x4_t MaskIfZero(int32x4_t a) {
  return MaskIfEqual(a, vdupq_n_s32(0));
}

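// vtstq_s32(a, a) sets a lane to all-ones iff (a & a) != 0, i.e. iff the
// lane is nonzero.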
template <>
inline int32x4_t MaskIfNonZero(int32x4_t a) {
  return vreinterpretq_s32_u32(vtstq_s32(a, a));
}

template <>
inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgtq_s32(a, b));
}

template <>
inline int32x4_t MaskIfGreaterThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgeq_s32(a, b));
}

template <>
inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcltq_s32(a, b));
}

template <>
inline int32x4_t MaskIfLessThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcleq_s32(a, b));
}

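// All/Any reduce a per-lane all-ones/all-zeros mask to a single bool.
// vextq_s32(a, a, n) rotates the lanes by n, so combining a with its
// rotations by 1 and 2 folds all four lanes into each lane; lane 0 then
// holds the reduction.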
template <>
inline bool All(int32x4_t a) {
  a = vandq_s32(a, vextq_s32(a, a, 1));
  a = vandq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}

template <>
inline bool Any(int32x4_t a) {
  a = vorrq_s32(a, vextq_s32(a, a, 1));
  a = vorrq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}

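// vrhaddq_s32 computes (a + b + 1) >> 1 per lane without intermediate
// overflow.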
template <>
inline int32x4_t RoundingHalfSum(int32x4_t a, int32x4_t b) {
  return vrhaddq_s32(a, b);
}

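// vqrdmulhq_s32 returns the high 32 bits of 2*a*b with rounding, saturating
// in the single overflow case (both inputs INT32_MIN); this is exactly the
// SaturatingRoundingDoublingHighMul contract from fixedpoint.h.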
template <>
inline int32x4_t SaturatingRoundingDoublingHighMul(int32x4_t a, int32x4_t b) {
  return vqrdmulhq_s32(a, b);
}

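// vrshlq_s32 with a negative count is a rounding arithmetic right shift that
// rounds ties upward. The fixup first subtracts 1 from negative inputs (the
// AND with -exponent keeps the sign bit only when exponent > 0), turning
// ties-up into ties-away-from-zero so the result matches the scalar
// RoundingDivideByPOT.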
template <>
inline int32x4_t RoundingDivideByPOT(int32x4_t x, int exponent) {
  const int32x4_t shift_vec = vdupq_n_s32(-exponent);
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
  const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed_up_x, shift_vec);
}

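// Multiplication by 2^Exponent: a saturating left shift (vqshlq_n_s32) when
// Exponent is positive, and a rounding right shift with the same
// negative-input fixup as above when Exponent is negative.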
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, 1> {
  static int32x4_t eval(int32x4_t x) { return vqshlq_n_s32(x, Exponent); }
};

template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, -1> {
  static int32x4_t eval(int32x4_t x) {
    const int32x4_t fixup = vshrq_n_s32(x, 31);
    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
    return vrshrq_n_s32(fixed_up_x, -Exponent);
  }
};

template <>
inline int32x4_t Dup<int32x4_t>(std::int32_t x) {
  return vdupq_n_s32(x);
}

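// A minimal usage sketch (illustrative only, using functions defined above):
//
//   int32x4_t x = Dup<int32x4_t>(1 << 20);
//   int32x4_t y = SaturatingRoundingDoublingHighMul(x, x);
//   int32x4_t q = RoundingDivideByPOT(y, 8);  // rounding divide by 2^8
//   bool any_neg = Any(MaskIfLessThan(q, Dup<int32x4_t>(0)));
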
}  // end namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_