// Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// This is a standalone testbed and benchmark for gemmlowp-style GEMM kernels,
// either doing integer or float arithmetic.
// It verifies that a kernel produces correct results, then benchmarks it.
//
// Some benchmark results are recorded in this spreadsheet:
//
// https://docs.google.com/spreadsheets/d/1UPbzbp9rdsD6RXxOr5q6AZ0n1omgEknLYO2ogiw6Kqk/edit?usp=sharing
//
// This program is entirely self-contained and can be compiled manually,
// as suggested in the command lines below.
// It currently supports only Android/ARM but would trivially generalize to
// other OSes (it's mostly standard POSIX) or architectures (each kernel
// targets a specific architecture, one may simply add more).

/*
Build and run this benchmark on Android/ARM/32bit:
~/android/toolchains/arm-linux-androideabi/bin/arm-linux-androideabi-clang++ \
-fPIE -pie -O3 --std=c++11 standalone/neon-gemm-kernel-benchmark.cc -o \
/tmp/benchmark -mfloat-abi=softfp -mfpu=neon-vfpv4 && adb push /tmp/benchmark \
/data/local/tmp && adb shell /data/local/tmp/benchmark

Build and run this benchmark on Android/ARM/64bit:
~/android/toolchains/aarch64-linux-android/bin/aarch64-linux-android-clang++ \
-fPIE -static -O3 --std=c++11 standalone/neon-gemm-kernel-benchmark.cc -o \
/tmp/benchmark && adb push /tmp/benchmark /data/local/tmp && adb shell \
/data/local/tmp/benchmark
*/

// For big.LITTLE devices, use 'taskset' to select which cores to benchmark.
//
// The syntax is: taskset <mask> <commandline>
// where <mask> is a hexadecimal bitmask in which each bit corresponds to a
// core, low bits being the little cores.
//
// Examples:
// Nexus 5X big cores:     taskset 30
// Nexus 5X little cores:  taskset 0f
// Pixel XL big cores:     taskset 0c
// Pixel XL little cores:  taskset 03
//
// Full example:
// adb shell taskset 0c /data/local/tmp/benchmark

#include <sched.h>
#include <unistd.h>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <random>
#include <type_traits>

#if !defined __arm__ && !defined __aarch64__
#error This benchmark assumes ARM (for inline assembly sections).
#endif

#include <arm_neon.h>

// Typically one wants to fit in L1 cache, and GEMM implementations
// are carefully optimized to tune their access patterns to that effect.
// Most devices have at least 16k of L1 cache. The Kraits have exactly 16k.
const int kDefaultCacheSizeK = 16;

const int kCacheLineSize = 64;

// These definitions are used for labels within assembly code. Required for
// iOS toolchain compatibility.
#define GEMMLOWP_LABEL_AFTER_LOOP "1"
#define GEMMLOWP_LABEL_LOOP "2"
#define GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES "3"
#define GEMMLOWP_LABEL_STORE "4"

// BEGIN code copied from gemmlowp/internal/kernel.h

// Explanation of general gemmlowp terminology
// ===========================================
//
// We use the following abbreviations:
// LHS = "left-hand side"
// RHS = "right-hand side"
// Sometimes when referring to either LHS or RHS, we just say a "Side".
//
// In a matrix product of a MxK matrix times a KxN matrix,
// we call K the 'depth'. Note that M is the number of rows
// of the result (and of the LHS), and N is the number of columns
// of the result (and of the RHS).
//
// In each of the LHS and RHS matrices, we call 'width' the
// other dimension, besides the depth. So in the LHS, 'width'
// is the number of rows, while in the RHS, 'width' is the number
// of columns.
//
// So in the LHS MxK matrix, the depth is K and the width is M.
// And in the RHS KxN matrix, the depth is K and the width is N.
//
// This is illustrated in this picture:
//
//                          RHS width
//                    <----------------->
//                    +-----------------+ ^
//                    |       RHS       | | Depth
//                    +-----------------+ v
//             ^ +--+ +-----------------+
//             | |L | |                 |
//   LHS width | |H | |     Result      |
//             | |S | |                 |
//             v +--+ +-----------------+
//               <-->
//               Depth

// Explanation of gemmlowp kernel formats and "cells"
// ==================================================
//
// Kernels operate on small LHS and RHS blocks that fit in registers.
// These blocks are stored contiguously in memory, but not always
// in a traditional column-major or row-major order; instead,
// they consist of a number of sub-blocks, which we call "cells",
// that are stored in column-major or row-major order. However,
// what really matters to us is not so much rows vs columns, but
// rather width vs depth. So we refer to "width-major" and "depth-major"
// storage orders. In the LHS, width-major means row-major,
// while in the RHS, width-major means column-major.
// There is also a third possibility, "diagonal order",
// which is unused at the moment.
//
// We aim to treat both sides, LHS and RHS, on an equal footing,
// so we call them both 'sides'. A KernelFormat thus is just a pair
// of KernelSideFormat's, one for LHS and one for RHS; each KernelSideFormat
// contains a CellFormat and a number of cells; cells are only ever
// stacked in the width dimension, which means stacked vertically in the
// LHS and stacked horizontally in the RHS.
//
// Example
// =======
//
// Let's work out the data layout expected by a kernel having the
// following format (the struct names here are defined below in this file):
//
//   KernelFormat<
//     KernelSideFormat<CellFormat<3, 4>, 3>,
//     KernelSideFormat<CellFormat<5, 4>, 2>
//   >
//
// The LHS format, KernelSideFormat<CellFormat<3, 4>, 3>, means:
// 3 cells, each cell having dimensions (width=3, depth=4), laid out in
// DepthMajor order (the default value, see CellFormat). In the LHS,
// DepthMajor means column-major, so the LHS cells are of size 3x4 in
// column-major order, so the LHS layout is:
//
//    0  3  6  9
//    1  4  7 10
//    2  5  8 11
//   12 15 18 21
//   13 16 19 22
//   14 17 20 23
//   24 27 30 33
//   25 28 31 34
//   26 29 32 35
//
// The RHS format, KernelSideFormat<CellFormat<5, 4>, 2>, means:
// 2 cells, each having dimensions (width=5, depth=4), laid out in
// DepthMajor order (the default value, see CellFormat). In the RHS,
// DepthMajor means row-major, so the RHS cells are of size 4x5 in
// row-major order, so the RHS layout is:
//
//    0  1  2  3  4 20 21 22 23 24
//    5  6  7  8  9 25 26 27 28 29
//   10 11 12 13 14 30 31 32 33 34
//   15 16 17 18 19 35 36 37 38 39

// CellOrder enumerates the possible storage orders (=layouts) for
// a cell (see explanation above).
enum class CellOrder { DepthMajor, WidthMajor, Diagonal };

// CellFormat describes how data is laid out in a cell. That is,
// a CellOrder together with actual dimensions.
template <int tWidth, int tDepth, CellOrder tOrder = CellOrder::DepthMajor>
struct CellFormat {
  static const int kWidth = tWidth;
  static const int kDepth = tDepth;
  static const CellOrder kOrder = tOrder;

  static const int kSize = kWidth * kDepth;
};

// KernelSideFormat describes how data is laid out in a kernel side
// (i.e. LHS or RHS). That is, a CellFormat together with a number of
// cells. These cells are always stacked in the Width dimension.
// For example, in the LHS case, the Width dimension is the rows dimension,
// so we're saying that in the LHS, cells are stacked vertically.
// We never stack cells in the Depth dimension.
template <typename tCellFormat, int tCells>
struct KernelSideFormat {
  typedef tCellFormat Cell;
  static const int kCells = tCells;
  static const int kWidth = kCells * Cell::kWidth;
  static const int kDepth = Cell::kDepth;
};

// KernelFormat describes fully the input data layout that a kernel expects.
// It consists of two KernelSideFormat's, one for LHS and one for RHS.
template <typename tLhs, typename tRhs>
struct KernelFormat {
  typedef tLhs Lhs;
  typedef tRhs Rhs;

  static_assert(Lhs::Cell::kDepth == Rhs::Cell::kDepth, "");
  static const int kDepth = Lhs::Cell::kDepth;
  static const int kRows = Lhs::Cell::kWidth * Lhs::kCells;
  static const int kCols = Rhs::Cell::kWidth * Rhs::kCells;
};

inline const char* CellOrderName(CellOrder o) {
  switch (o) {
    case CellOrder::DepthMajor:
      return "DepthMajor";
    case CellOrder::WidthMajor:
      return "WidthMajor";
    case CellOrder::Diagonal:
      return "Diagonal";
    default:
      assert(false);
      return nullptr;
  }
}

// Returns the offset into a cell, at which a given coefficient is stored.
template <typename CellFormat>
inline int OffsetIntoCell(int w, int d) {
  switch (CellFormat::kOrder) {
    case CellOrder::DepthMajor:
      return w + d * CellFormat::kWidth;
    case CellOrder::WidthMajor:
      return d + w * CellFormat::kDepth;
    case CellOrder::Diagonal:
      assert(CellFormat::kWidth == CellFormat::kDepth);
      static const int size = CellFormat::kWidth;
      return ((size + w - d) * size + d) % (size * size);
    default:
      assert(false);
      return 0;
  }
}

// END code copied from gemmlowp/internal/kernel.h

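// A quick illustration (a sketch added for exposition; this helper is
// hypothetical and not used by the benchmark): with the DepthMajor 3x4 cell
// from the worked example above, OffsetIntoCell recovers the offsets shown
// in the LHS layout diagram, e.g. coefficient (w=0, d=1) is stored at offset
// 0 + 1*3 == 3, and coefficient (w=2, d=0) at offset 2 + 0*3 == 2.
inline void ExampleCellOffsets() {
  typedef CellFormat<3, 4, CellOrder::DepthMajor> ExampleCell;
  for (int d = 0; d < ExampleCell::kDepth; d++) {
    for (int w = 0; w < ExampleCell::kWidth; w++) {
      std::cout << OffsetIntoCell<ExampleCell>(w, d) << " ";
    }
  }
  std::cout << std::endl;  // prints "0 1 2 3 ... 11": offsets scan depth-major
}
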
#ifdef __arm__

// This is the current standard kernel in gemmlowp, see:
// https://github.com/google/gemmlowp/blob/b1e2a29ff866680028f3080efc244e10e8dd7f46/internal/kernel_neon.h#L33
struct NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators {
  typedef std::uint8_t OperandType;
  typedef std::uint32_t AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load 1 Rhs cell of size 2x4
        "vld1.8 {d0}, [%[rhs_ptr]]!\n"
        // Load 3 Lhs cells of size 4x2 each
        "vld1.8 {d2}, [%[lhs_ptr]]!\n"
        "vld1.8 {d4}, [%[lhs_ptr]]!\n"
        "vld1.8 {d6}, [%[lhs_ptr]]!\n"
        // Load accumulators
        "mov r0, %[accum_ptr]\n"
        "vld1.32 {d8, d9}, [r0]!\n"
        "vld1.32 {d16, d17}, [r0]!\n"
        "vld1.32 {d24, d25}, [r0]!\n"
        "vld1.32 {d10, d11}, [r0]!\n"
        "vld1.32 {d18, d19}, [r0]!\n"
        "vld1.32 {d26, d27}, [r0]!\n"
        "vld1.32 {d12, d13}, [r0]!\n"
        "vld1.32 {d20, d21}, [r0]!\n"
        "vld1.32 {d28, d29}, [r0]!\n"
        "vld1.32 {d14, d15}, [r0]!\n"
        "vld1.32 {d22, d23}, [r0]!\n"
        "vld1.32 {d30, d31}, [r0]!\n"

        "subs %[depth], #2\n"

        "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"

        GEMMLOWP_LABEL_LOOP
        ":\n"
        // Overview of register layout:
        //
        // A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0).
        // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7
        // (q1--q3).
        // A 12x4 block of accumulators is stored in 32bit in q4--q15.
        //
        //                   +-----+-----+-----+-----+
        //                   |d0[0]|d0[1]|d0[2]|d0[3]|
        //              Rhs  +-----+-----+-----+-----+
        //                   |d1[0]|d1[1]|d1[2]|d1[3]|
        //                   +-----+-----+-----+-----+
        //
        //                   |     |     |     |     |
        //
        //    Lhs            |     |     |     |     |
        //
        //  +--+--+ - - - -  +-----+-----+-----+-----+
        //  |d2|d3|          | q4  | q5  | q6  | q7  |
        //  |d2|d3|          | q4  | q5  | q6  | q7  |
        //  |d2|d3|          | q4  | q5  | q6  | q7  |
        //  |d2|d3|          | q4  | q5  | q6  | q7  |
        //  +--+--+ - - - -  +-----+-----+-----+-----+
        //  |d4|d5|          | q8  | q9  | q10 | q11 |
        //  |d4|d5|          | q8  | q9  | q10 | q11 |
        //  |d4|d5|          | q8  | q9  | q10 | q11 |
        //  |d4|d5|          | q8  | q9  | q10 | q11 |
        //  +--+--+ - - - -  +-----+-----+-----+-----+
        //  |d6|d7|          | q12 | q13 | q14 | q15 |
        //  |d6|d7|          | q12 | q13 | q14 | q15 |
        //  |d6|d7|          | q12 | q13 | q14 | q15 |
        //  |d6|d7|          | q12 | q13 | q14 | q15 |
        //  +--+--+ - - - -  +-----+-----+-----+-----+
        //
        //                   Accumulator

        // Expand Lhs/Rhs cells to 16 bit.
        // Note: moving these vmovls further down to allow for
        // longer data pipelining helps a little on A57 but is
        // harmful on A53 --- it looks as if A53 doesn't like
        // interleaving vmovl's into the vmlal's.
        "vmovl.u8 q0, d0\n"
        "vmovl.u8 q1, d2\n"
        "vmovl.u8 q2, d4\n"
        "vmovl.u8 q3, d6\n"

        // Multiply-accumulate, level of depth 0
        "vmlal.u16 q4, d2, d0[0]\n"
        "vmlal.u16 q5, d2, d0[1]\n"
        "vmlal.u16 q6, d2, d0[2]\n"
        "vmlal.u16 q7, d2, d0[3]\n"
        "vldr d2, [%[lhs_ptr]]\n"
        "vmlal.u16 q8, d4, d0[0]\n"
        "vmlal.u16 q9, d4, d0[1]\n"
        "vmlal.u16 q10, d4, d0[2]\n"
        "vmlal.u16 q11, d4, d0[3]\n"
        "vldr d4, [%[lhs_ptr], #8]\n"
        "vmlal.u16 q12, d6, d0[0]\n"
        "vmlal.u16 q13, d6, d0[1]\n"
        "vmlal.u16 q14, d6, d0[2]\n"
        "vmlal.u16 q15, d6, d0[3]\n"
        "vldr d6, [%[lhs_ptr], #16]\n"
        "vldr d0, [%[rhs_ptr]]\n"

        // Multiply-accumulate, level of depth 1
        "vmlal.u16 q4, d3, d1[0]\n"
        "vmlal.u16 q5, d3, d1[1]\n"
        "add %[lhs_ptr], #24\n"
        "vmlal.u16 q6, d3, d1[2]\n"
        "vmlal.u16 q7, d3, d1[3]\n"
        "add %[rhs_ptr], #8\n"
        "vmlal.u16 q8, d5, d1[0]\n"
        "vmlal.u16 q9, d5, d1[1]\n"
        "subs %[depth], #2\n"
        "vmlal.u16 q10, d5, d1[2]\n"
        "vmlal.u16 q11, d5, d1[3]\n"
        "vmlal.u16 q12, d7, d1[0]\n"
        "vmlal.u16 q13, d7, d1[1]\n"
        "vmlal.u16 q14, d7, d1[2]\n"
        "vmlal.u16 q15, d7, d1[3]\n"

        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        GEMMLOWP_LABEL_AFTER_LOOP
        ":\n"

        // Expand Lhs/Rhs cells to 16 bit.
        "vmovl.u8 q0, d0\n"
        "vmovl.u8 q1, d2\n"
        "vmovl.u8 q2, d4\n"
        "vmovl.u8 q3, d6\n"

        // Multiply-accumulate, level of depth 0
        "vmlal.u16 q4, d2, d0[0]\n"
        "vmlal.u16 q5, d2, d0[1]\n"
        "vmlal.u16 q6, d2, d0[2]\n"
        "vmlal.u16 q7, d2, d0[3]\n"
        "vmlal.u16 q8, d4, d0[0]\n"
        "vmlal.u16 q9, d4, d0[1]\n"
        "vmlal.u16 q10, d4, d0[2]\n"
        "vmlal.u16 q11, d4, d0[3]\n"
        "vmlal.u16 q12, d6, d0[0]\n"
        "vmlal.u16 q13, d6, d0[1]\n"
        "vmlal.u16 q14, d6, d0[2]\n"
        "vmlal.u16 q15, d6, d0[3]\n"

        // Multiply-accumulate, level of depth 1
        "vmlal.u16 q4, d3, d1[0]\n"
        "vmlal.u16 q5, d3, d1[1]\n"
        "vmlal.u16 q6, d3, d1[2]\n"
        "vmlal.u16 q7, d3, d1[3]\n"
        "vmlal.u16 q8, d5, d1[0]\n"
        "vmlal.u16 q9, d5, d1[1]\n"
        "vmlal.u16 q10, d5, d1[2]\n"
        "vmlal.u16 q11, d5, d1[3]\n"
        "vmlal.u16 q12, d7, d1[0]\n"
        "vmlal.u16 q13, d7, d1[1]\n"
        "vmlal.u16 q14, d7, d1[2]\n"
        "vmlal.u16 q15, d7, d1[3]\n"

        // Store accumulators
        "mov r0, %[accum_ptr]\n"
        "vst1.32 {d8, d9}, [r0]!\n"
        "vst1.32 {d16, d17}, [r0]!\n"
        "vst1.32 {d24, d25}, [r0]!\n"
        "vst1.32 {d10, d11}, [r0]!\n"
        "vst1.32 {d18, d19}, [r0]!\n"
        "vst1.32 {d26, d27}, [r0]!\n"
        "vst1.32 {d12, d13}, [r0]!\n"
        "vst1.32 {d20, d21}, [r0]!\n"
        "vst1.32 {d28, d29}, [r0]!\n"
        "vst1.32 {d14, d15}, [r0]!\n"
        "vst1.32 {d22, d23}, [r0]!\n"
        "vst1.32 {d30, d31}, [r0]!\n"
        :  // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        :  // inputs
        [accum_ptr] "r"(accum_ptr)
        :  // clobbers
        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
        "d28", "d29", "d30", "d31");
  }
};

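// A scalar sketch (illustrative only, not used by the benchmark) of what each
// kernel's Run computes for its Format. It assumes the packed layouts
// described in the comments above: within each group of Format::kDepth depth
// levels, the cells of each side are stored consecutively (each laid out per
// OffsetIntoCell), and the kRows x kCols accumulator block is stored
// column-major, matching the order the kernels' load/store sequences traverse.
template <typename Kernel>
void ReferenceRunSketch(const typename Kernel::OperandType* lhs_ptr,
                        const typename Kernel::OperandType* rhs_ptr,
                        typename Kernel::AccumulatorType* accum_ptr,
                        int depth) {
  typedef typename Kernel::Format Format;
  typedef typename Format::Lhs::Cell LhsCell;
  typedef typename Format::Rhs::Cell RhsCell;
  typedef typename Kernel::AccumulatorType Accum;
  for (int d = 0; d < depth; d += Format::kDepth) {
    for (int c = 0; c < Format::kCols; c++) {
      for (int r = 0; r < Format::kRows; r++) {
        for (int dd = 0; dd < Format::kDepth; dd++) {
          // Locate coefficient (r, dd) in the packed LHS cells and
          // coefficient (c, dd) in the packed RHS cells.
          const int lhs_offset =
              (r / LhsCell::kWidth) * LhsCell::kSize +
              OffsetIntoCell<LhsCell>(r % LhsCell::kWidth, dd);
          const int rhs_offset =
              (c / RhsCell::kWidth) * RhsCell::kSize +
              OffsetIntoCell<RhsCell>(c % RhsCell::kWidth, dd);
          accum_ptr[r + c * Format::kRows] +=
              Accum(lhs_ptr[lhs_offset]) * Accum(rhs_ptr[rhs_offset]);
        }
      }
    }
    lhs_ptr += Format::kRows * Format::kDepth;
    rhs_ptr += Format::kCols * Format::kDepth;
  }
}
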
// This is Maciek Chociej's fast kernel not expanding operands,
// from gemmlowp/meta/. Search for
// mul_3x8_3x8_int32_lhsadd_rhsadd
// in this file:
// https://raw.githubusercontent.com/google/gemmlowp/e4b9d858b6637d5d0058bfa3d869d2b95864251b/meta/single_thread_gemm.h
struct NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand {
  typedef std::uint8_t OperandType;
  typedef std::uint32_t AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<3, 8, CellOrder::WidthMajor>, 1>,
      KernelSideFormat<CellFormat<3, 8, CellOrder::WidthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Clear aggregators.
        "vmov.i32 q0, #0\n"
        "vmov.i32 q1, #0\n"
        "vmov.i32 q2, #0\n"
        "vmov.i32 q3, q0\n"
        "vmov.i32 q4, q1\n"
        "vmov.i32 q5, q2\n"
        "vmov.i32 q6, q3\n"
        "vmov.i32 q7, q4\n"
        "vmov.i32 q8, q5\n"

        // Loop head
        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Subtract counter.
        "subs %[depth], %[depth], #8\n"

        "vld1.8 {d18, d19, d20}, [%[rhs_ptr]]!\n"
        "vld1.8 {d21, d22, d23}, [%[lhs_ptr]]!\n"
        "vmull.u8 q12, d18, d21\n"
        "vmull.u8 q13, d18, d22\n"
        "vmull.u8 q14, d18, d23\n"
        "vmull.u8 q15, d19, d21\n"
        "vpadal.u16 q0, q12\n"
        "vpadal.u16 q1, q13\n"
        "vpadal.u16 q2, q14\n"
        "vpadal.u16 q3, q15\n"
        "vmull.u8 q12, d19, d22\n"
        "vmull.u8 q13, d19, d23\n"
        "vmull.u8 q14, d20, d21\n"
        "vmull.u8 q15, d20, d22\n"
        "vmull.u8 q9, d20, d23\n"
        "vpadal.u16 q4, q12\n"
        "vpadal.u16 q5, q13\n"
        "vpadal.u16 q6, q14\n"
        "vpadal.u16 q7, q15\n"
        "vpadal.u16 q8, q9\n"

        // Loop branch
        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Horizontal reduce aggregators, step 1
        "vpadd.u32 d0, d0, d1\n"
        "vpadd.u32 d2, d2, d3\n"
        "vpadd.u32 d4, d4, d5\n"
        "vpadd.u32 d6, d6, d7\n"
        "vpadd.u32 d8, d8, d9\n"
        "vpadd.u32 d10, d10, d11\n"
        "vpadd.u32 d12, d12, d13\n"
        "vpadd.u32 d14, d14, d15\n"
        "vpadd.u32 d16, d16, d17\n"

        // Horizontal reduce aggregators, step 2
        "vpadd.u32 d0, d0, d2\n"
        "vpadd.u32 d1, d4, d4\n"
        "vpadd.u32 d6, d6, d8\n"
        "vpadd.u32 d7, d10, d10\n"
        "vpadd.u32 d12, d12, d14\n"
        "vpadd.u32 d13, d16, d16\n"

        // Load accumulators
        "mov r0, %[accum_ptr]\n"
        "vld1.32 {d2}, [r0]!\n"
        "vld1.32 {d3[0]}, [r0]!\n"

        "vld1.32 {d8}, [r0]!\n"
        "vld1.32 {d9[0]}, [r0]!\n"

        "vld1.32 {d14}, [r0]!\n"
        "vld1.32 {d15[0]}, [r0]!\n"

        // Accumulate
        "vadd.s32 q0, q0, q1\n"
        "vadd.s32 q3, q3, q4\n"
        "vadd.s32 q6, q6, q7\n"

        // Store accumulators
        "mov r0, %[accum_ptr]\n"
        "vst1.32 {d0}, [r0]!\n"
        "vst1.32 {d1[0]}, [r0]!\n"

        "vst1.32 {d6}, [r0]!\n"
        "vst1.32 {d7[0]}, [r0]!\n"

        "vst1.32 {d12}, [r0]!\n"
        "vst1.32 {d13[0]}, [r0]!\n"
        :  // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        :  // inputs
        [accum_ptr] "r"(accum_ptr)
        :  // clobbers
        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
        "d28", "d29", "d30", "d31");
  }
};

// Fast kernel operating on int8 operands.
// It is assumed that one of the two int8 operands only takes values
// in [-127, 127], while the other may freely range in [-128, 127].
// The issue with both operands taking the value -128 is that:
// -128*-128 + -128*-128 == 32768 does not fit in int16 (max 32767).
// Every other expression a*b + c*d, for any int8 a,b,c,d, fits in int16
// range. That is the basic idea of this kernel.
// (A scalar sketch of this accumulation scheme follows the kernel below.)
struct NEON_32bit_GEMM_Int8Operands_AccumTwoWithin16Bits {
  typedef std::int8_t OperandType;
  typedef std::int32_t AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
      KernelSideFormat<CellFormat<2, 16, CellOrder::WidthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    std::size_t start_depth = 123;
    std::size_t run_depth = depth;
    std::size_t dst_col_stride = 4;
    AccumulatorType* dst_ptr = accum_ptr;
    asm volatile(

        // Overview of register layout:
        //
        // A 2x16 block of Rhs is stored in 8 bit in d0--d3.
        // A 4x16 block of Lhs is stored in 8 bit in d4--d7. That is only
        // half of the register space required, so we loop over these registers
        // twice. Only half of it, a 2x16 block, is stored in d4--d7 at
        // any given time.
        //
        // A 4x2 block of accumulators is stored in q8--q15 (as 4x32 bit
        // components which need to be horizontally-added at the end)
        //
        // The Lhs vectors are multiplied by the Rhs vectors with a widening
        // multiply over the 8 first levels of depth, producing int16x8
        // vectors of products for each position in the accumulator matrix.
        // Here comes the special trick: since the operands are signed int8,
        // their range being [ -2^7 , 2^7 ), their products are in range
        // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values
        // without any risk of overflowing int16.
        // We thus proceed with the 8 next levels of depth, multiplying
        // again Lhs by Rhs, accumulating into this existing int16x8 vector.
        //
        // Only then, having processed 16 levels of depth, do we need to
        // horizontally add these int16x8 accumulators into the final
        // int32x4 accumulators.
        //
        // As we do not have enough registers to store all 16 int16x8
        // temporary-16bit-accumulators, we have them cycle through q4--q7.
        //
        //
        // Register layout (ignoring the q4--q7 temporary 16bit accumulators):
        //
        //                            +----+----+
        //                            | d0 | d2 |
        //                            | .  | .  |
        //                            | .  | .  |
        //                            | .  | .  |
        //                      Rhs   +----+----+
        //                            | d1 | d3 |
        //                            | .  | .  |
        //                            | .  | .  |
        //                            | .  | .  |
        //                            +----+----+
        //
        //                            |    |    |
        //
        //    Lhs                     |    |    |
        //
        //  +--------+--------+ - - - +----+----+
        //  | d4 ... | d5 ... |       | q8 | q9 |
        //  | d6 ... | d7 ... |       | q10| q11|
        //  | d4 ... | d5 ... |       | q12| q13|
        //  | d6 ... | d7 ... |       | q14| q15|
        //  +--------+--------+ - - - +----+----+
        //
        //                            Accumulator
        //

        // Clear accumulators, and, interleaved with it,
        // initial loads of the first loop iteration,
        // taken out of the loop so that in the loop itself we have
        // optimal streaming of data from memory.
        "vldr d0, [%[rhs_ptr], #0]\n"
        "vmov.i32 q8, #0\n"
        "vldr d4, [%[lhs_ptr], #0]\n"
        "vmov.i32 q9, #0\n"
        "vldr d2, [%[rhs_ptr], #16]\n"
        "vmov.i32 q10, q8\n"
        "vldr d6, [%[lhs_ptr], #16]\n"
        "vmov.i32 q11, q8\n"
        "vldr d1, [%[rhs_ptr], #8]\n"
        "vmov.i32 q12, q8\n"
        "vldr d5, [%[lhs_ptr], #8]\n"
        "vmov.i32 q13, q8\n"
        "vldr d3, [%[rhs_ptr], #24]\n"
        "vmov.i32 q14, q8\n"
        "vldr d7, [%[lhs_ptr], #24]\n"
        "vmov.i32 q15, q8\n"

        // General loop.
        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Multiply 8 first levels of depth.
        "vmull.s8 q4, d0, d4\n"
        "add %[rhs_ptr], %[rhs_ptr], #32\n"
        "vmull.s8 q5, d2, d4\n"
        "vldr d4, [%[lhs_ptr], #32]\n"
        "vmull.s8 q6, d0, d6\n"
        "vmull.s8 q7, d2, d6\n"
        "vldr d6, [%[lhs_ptr], #48]\n"

        // Multiply-accumulate second-half, again into the same
        // 16bit local accumulator registers. This is where we
        // take advantage of having int8 instead of uint8 and therefore
        // being able to accumulate two products into int16.
        "vmlal.s8 q4, d1, d5\n"
        "vmlal.s8 q5, d3, d5\n"
        "vldr d5, [%[lhs_ptr], #40]\n"
        "vmlal.s8 q6, d1, d7\n"
        "vmlal.s8 q7, d3, d7\n"
        "vldr d7, [%[lhs_ptr], #56]\n"

        // Add pairwise, accumulate into 32-bit accumulators.
        "vpadal.s16 q8, q4\n"
        "add %[lhs_ptr], %[lhs_ptr], #64\n"
        "vpadal.s16 q9, q5\n"
        "subs %[run_depth], %[run_depth], #16\n"
        "vpadal.s16 q10, q6\n"
        "vpadal.s16 q11, q7\n"

        "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"

        // Multiply first half.
        "vmull.s8 q4, d0, d4\n"
        "vmull.s8 q5, d2, d4\n"
        "vldr d4, [%[lhs_ptr], #0]\n"
        "vmull.s8 q6, d0, d6\n"
        "vldr d0, [%[rhs_ptr], #0]\n"
        "vmull.s8 q7, d2, d6\n"
        "vldr d2, [%[rhs_ptr], #16]\n"

        // Multiply-accumulate second-half, again into the same
        // 16bit local accumulator registers. This is where we
        // take advantage of having int8 instead of uint8 and therefore
        // being able to accumulate two products into int16.
        "vmlal.s8 q4, d1, d5\n"
        "vldr d6, [%[lhs_ptr], #16]\n"
        "vmlal.s8 q5, d3, d5\n"
        "vldr d5, [%[lhs_ptr], #8]\n"
        "vmlal.s8 q6, d1, d7\n"
        "vldr d1, [%[rhs_ptr], #8]\n"
        "vmlal.s8 q7, d3, d7\n"
        "vldr d3, [%[rhs_ptr], #24]\n"

        // Add pairwise, accumulate into 32-bit accumulators.
        "vpadal.s16 q12, q4\n"
        "vldr d7, [%[lhs_ptr], #24]\n"
        "vpadal.s16 q13, q5\n"
        "vpadal.s16 q14, q6\n"
        "vpadal.s16 q15, q7\n"

        "b " GEMMLOWP_LABEL_LOOP "b\n"

        GEMMLOWP_LABEL_AFTER_LOOP
        ":\n"

        // Multiply first half.
        "vmull.s8 q4, d0, d4\n"
        "vmull.s8 q5, d2, d4\n"
        "vmull.s8 q6, d0, d6\n"
        "vmull.s8 q7, d2, d6\n"

        // Multiply-accumulate second-half, again into the same
        // 16bit local accumulator registers. This is where we
        // take advantage of having int8 instead of uint8 and therefore
        // being able to accumulate two products into int16.
        "vmlal.s8 q4, d1, d5\n"
        "vmlal.s8 q5, d3, d5\n"
        "vmlal.s8 q6, d1, d7\n"
        "vmlal.s8 q7, d3, d7\n"

        // Add pairwise, accumulate into 32-bit accumulators.
        "vpadal.s16 q12, q4\n"
        "vpadal.s16 q13, q5\n"
        "vpadal.s16 q14, q6\n"
        "vpadal.s16 q15, q7\n"
        "cmp %[start_depth], #0\n"

        // Reduce 32bit accumulators horizontally.
        "vpadd.s32 d0, d16, d17\n"
        "vpadd.s32 d1, d18, d19\n"
        "vpadd.s32 d2, d20, d21\n"
        "vpadd.s32 d3, d22, d23\n"
        "vpadd.s32 d4, d24, d25\n"
        "vpadd.s32 d5, d26, d27\n"
        "vpadd.s32 d6, d28, d29\n"
        "vpadd.s32 d7, d30, d31\n"

        "bne " GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES "f\n"

        // Reduce 32bit accumulators horizontally, second pass
        // (each pass adds pairwise. we need to add 4-wise).
        "vpadd.s32 d8, d0, d2\n"
        "vpadd.s32 d9, d4, d6\n"
        "vpadd.s32 d10, d1, d3\n"
        "vpadd.s32 d11, d5, d7\n"

        "b " GEMMLOWP_LABEL_STORE "f\n"

        GEMMLOWP_LABEL_ACCUMULATE_EXISTING_DST_VALUES
        ":\n"

        // Reduce 32bit accumulators horizontally, second pass
        // (each pass adds pairwise. we need to add 4-wise),
        // and load destination values from memory.
        "mov r0, %[dst_ptr]\n"
        "vld1.32 {d16, d17}, [r0]!\n"
        "vpadd.s32 d8, d0, d2\n"
        "vpadd.s32 d9, d4, d6\n"
        "vld1.32 {d18, d19}, [r0]\n"
        "vpadd.s32 d10, d1, d3\n"
        "vpadd.s32 d11, d5, d7\n"

        // Add horizontally-reduced accumulators into
        // the values loaded from memory
        "vadd.s32 q4, q8, q4\n"
        "vadd.s32 q5, q9, q5\n"

        GEMMLOWP_LABEL_STORE
        ":\n"
        // Store back into memory
        "mov r0, %[dst_ptr]\n"
        "vst1.32 {d8, d9}, [r0]!\n"
        "vst1.32 {d10, d11}, [r0]\n"
        :  // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [dst_ptr] "+r"(dst_ptr), [run_depth] "+r"(run_depth)
        :  // inputs
        [start_depth] "r"(start_depth)
        :  // clobbers
        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
        "d28", "d29", "d30", "d31");
  }
};

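// A scalar sketch (illustrative only, not used by the benchmark) of the trick
// used by the kernel above: with one operand restricted to [-127, 127], every
// product lies in [-16256, 16256], so the sum of two such products fits in
// int16, and we only need to widen to 32 bit once per two levels of depth.
inline std::int32_t AccumTwoWithin16BitsSketch(const std::int8_t* lhs,
                                               const std::int8_t* rhs,
                                               int depth) {
  std::int32_t accum = 0;
  for (int d = 0; d < depth; d += 2) {
    // Each product is at most 16256 in magnitude, so this sum stays within
    // the int16 range [-32512, 32512].
    const std::int16_t two_products =
        std::int16_t(lhs[d] * rhs[d]) + std::int16_t(lhs[d + 1] * rhs[d + 1]);
    accum += two_products;  // the only widening step, amortized over 2 levels
  }
  return accum;
}
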
// We don't actually use int32*int32 in production. This is just an
// experiment to help dissociate the effect of integer-vs-float from the
// effect of operand width.
struct NEON_32bit_GEMM_Int32_WithScalar {
  typedef std::int32_t OperandType;
  typedef std::int32_t AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov r0, %[accum_ptr]\n"
        "vld1.32 {d8, d9}, [r0]!\n"
        "vld1.32 {d16, d17}, [r0]!\n"
        "vld1.32 {d24, d25}, [r0]!\n"
        "vld1.32 {d10, d11}, [r0]!\n"
        "vld1.32 {d18, d19}, [r0]!\n"
        "vld1.32 {d26, d27}, [r0]!\n"
        "vld1.32 {d12, d13}, [r0]!\n"
        "vld1.32 {d20, d21}, [r0]!\n"
        "vld1.32 {d28, d29}, [r0]!\n"
        "vld1.32 {d14, d15}, [r0]!\n"
        "vld1.32 {d22, d23}, [r0]!\n"
        "vld1.32 {d30, d31}, [r0]!\n"

        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Load 1 Rhs cell of size 1x4
        "vld1.32 {d0, d1}, [%[rhs_ptr]]!\n"

        // Load 3 Lhs cells of size 4x1 each
        "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
        "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
        "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"

        // Multiply-accumulate
        "vmla.s32 q4, q1, d0[0]\n"
        "vmla.s32 q5, q1, d0[1]\n"
        "vmla.s32 q6, q1, d1[0]\n"
        "vmla.s32 q7, q1, d1[1]\n"
        "vmla.s32 q8, q2, d0[0]\n"
        "vmla.s32 q9, q2, d0[1]\n"
        "vmla.s32 q10, q2, d1[0]\n"
        "vmla.s32 q11, q2, d1[1]\n"
        "vmla.s32 q12, q3, d0[0]\n"
        "vmla.s32 q13, q3, d0[1]\n"
        "vmla.s32 q14, q3, d1[0]\n"
        "vmla.s32 q15, q3, d1[1]\n"

        // Loop. Decrement loop index (depth) by 1, since we just handled 1
        // level of depth.
        "subs %[depth], #1\n"
        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Store accumulators
        "mov r0, %[accum_ptr]\n"
        "vst1.32 {d8, d9}, [r0]!\n"
        "vst1.32 {d16, d17}, [r0]!\n"
        "vst1.32 {d24, d25}, [r0]!\n"
        "vst1.32 {d10, d11}, [r0]!\n"
        "vst1.32 {d18, d19}, [r0]!\n"
        "vst1.32 {d26, d27}, [r0]!\n"
        "vst1.32 {d12, d13}, [r0]!\n"
        "vst1.32 {d20, d21}, [r0]!\n"
        "vst1.32 {d28, d29}, [r0]!\n"
        "vst1.32 {d14, d15}, [r0]!\n"
        "vst1.32 {d22, d23}, [r0]!\n"
        "vst1.32 {d30, d31}, [r0]!\n"
        :  // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        :  // inputs
        [accum_ptr] "r"(accum_ptr)
        :  // clobbers
        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
        "d28", "d29", "d30", "d31");
  }
};

// Not a very efficient kernel, just an experiment to see what we can do
// without using NEON multiply-with-scalar instructions.
struct NEON_32bit_GEMM_Float32_MLA_WithVectorDuplicatingScalar {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov r0, %[accum_ptr]\n"
        "vld1.32 {d8, d9}, [r0]!\n"
        "vld1.32 {d16, d17}, [r0]!\n"
        "vld1.32 {d24, d25}, [r0]!\n"
        "vld1.32 {d10, d11}, [r0]!\n"
        "vld1.32 {d18, d19}, [r0]!\n"
        "vld1.32 {d26, d27}, [r0]!\n"
        "vld1.32 {d12, d13}, [r0]!\n"
        "vld1.32 {d20, d21}, [r0]!\n"
        "vld1.32 {d28, d29}, [r0]!\n"
        "vld1.32 {d14, d15}, [r0]!\n"
        "vld1.32 {d22, d23}, [r0]!\n"
        "vld1.32 {d30, d31}, [r0]!\n"

        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Load 3 Lhs cells of size 4x1 each
        "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
        "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
        "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"

        // Multiply-accumulate
        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
        "vmla.f32 q4, q1, q0\n"
        "vmla.f32 q8, q2, q0\n"
        "vmla.f32 q12, q3, q0\n"
        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
        "vmla.f32 q5, q1, q0\n"
        "vmla.f32 q9, q2, q0\n"
        "vmla.f32 q13, q3, q0\n"
        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
        "vmla.f32 q6, q1, q0\n"
        "vmla.f32 q10, q2, q0\n"
        "vmla.f32 q14, q3, q0\n"
        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
        "vmla.f32 q7, q1, q0\n"
        "vmla.f32 q11, q2, q0\n"
        "vmla.f32 q15, q3, q0\n"

        // Loop. Decrement loop index (depth) by 1, since we just handled 1
        // level of depth.
        "subs %[depth], #1\n"
        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Store accumulators
        "mov r0, %[accum_ptr]\n"
        "vst1.32 {d8, d9}, [r0]!\n"
        "vst1.32 {d16, d17}, [r0]!\n"
        "vst1.32 {d24, d25}, [r0]!\n"
        "vst1.32 {d10, d11}, [r0]!\n"
        "vst1.32 {d18, d19}, [r0]!\n"
        "vst1.32 {d26, d27}, [r0]!\n"
        "vst1.32 {d12, d13}, [r0]!\n"
        "vst1.32 {d20, d21}, [r0]!\n"
        "vst1.32 {d28, d29}, [r0]!\n"
        "vst1.32 {d14, d15}, [r0]!\n"
        "vst1.32 {d22, d23}, [r0]!\n"
        "vst1.32 {d30, d31}, [r0]!\n"
        :  // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        :  // inputs
        [accum_ptr] "r"(accum_ptr)
        :  // clobbers
        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
        "d28", "d29", "d30", "d31");
  }
};

// Not a very efficient kernel, just an experiment to see what we can do
// without using NEON multiply-with-scalar instructions.
// This variant is relevant because, on ARMv7, FMA does not have a
// with-scalar variant.
struct NEON_32bit_GEMM_Float32_FMA_WithVectorDuplicatingScalar {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov r0, %[accum_ptr]\n"
        "vld1.32 {d8, d9}, [r0]!\n"
        "vld1.32 {d16, d17}, [r0]!\n"
        "vld1.32 {d24, d25}, [r0]!\n"
        "vld1.32 {d10, d11}, [r0]!\n"
        "vld1.32 {d18, d19}, [r0]!\n"
        "vld1.32 {d26, d27}, [r0]!\n"
        "vld1.32 {d12, d13}, [r0]!\n"
        "vld1.32 {d20, d21}, [r0]!\n"
        "vld1.32 {d28, d29}, [r0]!\n"
        "vld1.32 {d14, d15}, [r0]!\n"
        "vld1.32 {d22, d23}, [r0]!\n"
        "vld1.32 {d30, d31}, [r0]!\n"

        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Load 3 Lhs cells of size 4x1 each
        "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
        "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
        "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"

        // Multiply-accumulate
        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
        "vfma.f32 q4, q1, q0\n"
        "vfma.f32 q8, q2, q0\n"
        "vfma.f32 q12, q3, q0\n"
        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
        "vfma.f32 q5, q1, q0\n"
        "vfma.f32 q9, q2, q0\n"
        "vfma.f32 q13, q3, q0\n"
        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
        "vfma.f32 q6, q1, q0\n"
        "vfma.f32 q10, q2, q0\n"
        "vfma.f32 q14, q3, q0\n"
        "vld1.32 {d0[], d1[]}, [%[rhs_ptr]]!\n"
        "vfma.f32 q7, q1, q0\n"
        "vfma.f32 q11, q2, q0\n"
        "vfma.f32 q15, q3, q0\n"

        // Loop. Decrement loop index (depth) by 1, since we just handled 1
        // level of depth.
        "subs %[depth], #1\n"
        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Store accumulators
        "mov r0, %[accum_ptr]\n"
        "vst1.32 {d8, d9}, [r0]!\n"
        "vst1.32 {d16, d17}, [r0]!\n"
        "vst1.32 {d24, d25}, [r0]!\n"
        "vst1.32 {d10, d11}, [r0]!\n"
        "vst1.32 {d18, d19}, [r0]!\n"
        "vst1.32 {d26, d27}, [r0]!\n"
        "vst1.32 {d12, d13}, [r0]!\n"
        "vst1.32 {d20, d21}, [r0]!\n"
        "vst1.32 {d28, d29}, [r0]!\n"
        "vst1.32 {d14, d15}, [r0]!\n"
        "vst1.32 {d22, d23}, [r0]!\n"
        "vst1.32 {d30, d31}, [r0]!\n"
        :  // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        :  // inputs
        [accum_ptr] "r"(accum_ptr)
        :  // clobbers
        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
        "d28", "d29", "d30", "d31");
  }
};

// This is the "most natural" kernel, using NEON multiply-with-scalar
// instructions.
struct NEON_32bit_GEMM_Float32_MLA_WithScalar {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov r0, %[accum_ptr]\n"
        "vld1.32 {d8, d9}, [r0]!\n"
        "vld1.32 {d16, d17}, [r0]!\n"
        "vld1.32 {d24, d25}, [r0]!\n"
        "vld1.32 {d10, d11}, [r0]!\n"
        "vld1.32 {d18, d19}, [r0]!\n"
        "vld1.32 {d26, d27}, [r0]!\n"
        "vld1.32 {d12, d13}, [r0]!\n"
        "vld1.32 {d20, d21}, [r0]!\n"
        "vld1.32 {d28, d29}, [r0]!\n"
        "vld1.32 {d14, d15}, [r0]!\n"
        "vld1.32 {d22, d23}, [r0]!\n"
        "vld1.32 {d30, d31}, [r0]!\n"

        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Load 1 Rhs cell of size 1x4
        "vld1.32 {d0, d1}, [%[rhs_ptr]]!\n"

        // Load 3 Lhs cells of size 4x1 each
        "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
        "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
        "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"

        // Multiply-accumulate
        "vmla.f32 q4, q1, d0[0]\n"
        "vmla.f32 q5, q1, d0[1]\n"
        "vmla.f32 q6, q1, d1[0]\n"
        "vmla.f32 q7, q1, d1[1]\n"
        "vmla.f32 q8, q2, d0[0]\n"
        "vmla.f32 q9, q2, d0[1]\n"
        "vmla.f32 q10, q2, d1[0]\n"
        "vmla.f32 q11, q2, d1[1]\n"
        "vmla.f32 q12, q3, d0[0]\n"
        "vmla.f32 q13, q3, d0[1]\n"
        "vmla.f32 q14, q3, d1[0]\n"
        "vmla.f32 q15, q3, d1[1]\n"

        // Loop. Decrement loop index (depth) by 1, since we just handled 1
        // level of depth.
        "subs %[depth], #1\n"
        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Store accumulators
        "mov r0, %[accum_ptr]\n"
        "vst1.32 {d8, d9}, [r0]!\n"
        "vst1.32 {d16, d17}, [r0]!\n"
        "vst1.32 {d24, d25}, [r0]!\n"
        "vst1.32 {d10, d11}, [r0]!\n"
        "vst1.32 {d18, d19}, [r0]!\n"
        "vst1.32 {d26, d27}, [r0]!\n"
        "vst1.32 {d12, d13}, [r0]!\n"
        "vst1.32 {d20, d21}, [r0]!\n"
        "vst1.32 {d28, d29}, [r0]!\n"
        "vst1.32 {d14, d15}, [r0]!\n"
        "vst1.32 {d22, d23}, [r0]!\n"
        "vst1.32 {d30, d31}, [r0]!\n"
        :  // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        :  // inputs
        [accum_ptr] "r"(accum_ptr)
        :  // clobbers
        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
        "d28", "d29", "d30", "d31");
  }
};

// Faster kernel contributed by ARM in 64bit form
// (see NEON_64bit_GEMM_Float32_WithScalar_A53) then ported to 32bit code.
// Tuned for A53.
struct NEON_32bit_GEMM_Float32_WithScalar_A53 {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov r0, %[accum_ptr]\n"
        "vld1.32 {d8, d9}, [r0]!\n"
        "vld1.32 {d16, d17}, [r0]!\n"
        "vld1.32 {d24, d25}, [r0]!\n"
        "vld1.32 {d10, d11}, [r0]!\n"
        "vld1.32 {d18, d19}, [r0]!\n"
        "vld1.32 {d26, d27}, [r0]!\n"
        "vld1.32 {d12, d13}, [r0]!\n"
        "vld1.32 {d20, d21}, [r0]!\n"
        "vld1.32 {d28, d29}, [r0]!\n"
        "vld1.32 {d14, d15}, [r0]!\n"
        "vld1.32 {d22, d23}, [r0]!\n"
        "vld1.32 {d30, d31}, [r0]!\n"

        // Overview of register layout:
        //
        // A 1x4 cell of Rhs is stored in d0--d1 (q0).
        // A 12x1 block of 3 4x1 cells Lhs is stored in d2--d7
        // (q1--q3).
        // A 12x4 block of accumulators is stored in q4--q15.
        //
        //                   +-----+-----+-----+-----+
        //              Rhs  |d0[0]|d0[1]|d1[0]|d1[1]|
        //                   +-----+-----+-----+-----+
        //
        //                   |     |     |     |     |
        //
        //    Lhs            |     |     |     |     |
        //
        //  +--+- - - - - -  +-----+-----+-----+-----+
        //  |d2|             | q4  | q5  | q6  | q7  |
        //  |d2|             | q4  | q5  | q6  | q7  |
        //  |d3|             | q4  | q5  | q6  | q7  |
        //  |d3|             | q4  | q5  | q6  | q7  |
        //  +--+- - - - - -  +-----+-----+-----+-----+
        //  |d4|             | q8  | q9  | q10 | q11 |
        //  |d4|             | q8  | q9  | q10 | q11 |
        //  |d5|             | q8  | q9  | q10 | q11 |
        //  |d5|             | q8  | q9  | q10 | q11 |
        //  +--+ - - - - - - +-----+-----+-----+-----+
        //  |d6|             | q12 | q13 | q14 | q15 |
        //  |d6|             | q12 | q13 | q14 | q15 |
        //  |d7|             | q12 | q13 | q14 | q15 |
        //  |d7|             | q12 | q13 | q14 | q15 |
        //  +--+- - - - - -  +-----+-----+-----+-----+
        //
        //                   Accumulator

        // Load Rhs cell
        "vldr d0, [%[rhs_ptr]]\n"
        "ldr r2, [%[rhs_ptr], #8]\n"
        "ldr r3, [%[rhs_ptr], #12]\n"

        // Load 1st Lhs Cell
        "vld1.32 {d2, d3}, [%[lhs_ptr]]\n"

        GEMMLOWP_LABEL_LOOP
        ":\n"

        "vldr d4, [%[lhs_ptr], #16]\n"  // Load 1st half of 2nd Lhs cell
        "vmov d1, r2, r3\n"             // Prepare 2nd half of Rhs cell
        "vmla.f32 q4, q1, d0[0]\n"      // Multiply 1st Lhs cell with column 0
        "ldr r2, [%[lhs_ptr], #24]\n"   // Load 2nd half of 2nd Lhs cell, part 1
        "vmla.f32 q5, q1, d0[1]\n"      // Multiply 1st Lhs cell with column 1
        "ldr r3, [%[lhs_ptr], #28]\n"   // Load 2nd half of 2nd Lhs cell, part 2
        "vmla.f32 q6, q1, d1[0]\n"      // Multiply 1st Lhs cell with column 2
        "subs %[depth], #1\n"

        "vldr d6, [%[lhs_ptr], #32]\n"  // Load 1st half of 3rd Lhs cell
        "vmov d5, r2, r3\n"             // Prepare 2nd half of 2nd Lhs cell
        "vmla.f32 q7, q1, d1[1]\n"      // Multiply 1st Lhs cell with column 3
        "ldr r2, [%[lhs_ptr], #40]\n"   // Load 2nd half of 3rd Lhs cell, part 1
        "vmla.f32 q8, q2, d0[0]\n"      // Multiply 2nd Lhs cell with column 0
        "ldr r3, [%[lhs_ptr], #44]\n"   // Load 2nd half of 3rd Lhs cell, part 2
        "vmla.f32 q9, q2, d0[1]\n"      // Multiply 2nd Lhs cell with column 1
        "add %[rhs_ptr], %[rhs_ptr], #16\n"  // Move forward by 1 Rhs cell

        "vldr d2, [%[lhs_ptr], #48]\n"  // Load 1st half of 1st Lhs cell of next
                                        // iteration
        "vmov d7, r2, r3\n"             // Prepare 2nd half of 3rd Lhs cell
        "vmla.f32 q10, q2, d1[0]\n"     // Multiply 2nd Lhs cell with column 2
        "ldr r2, [%[lhs_ptr], #56]\n"   // Load 2nd half of 1st Lhs cell of next
                                        // iter, part 1
        "vmla.f32 q12, q3, d0[0]\n"     // Multiply 3rd Lhs cell with column 0
        "ldr r3, [%[lhs_ptr], #60]\n"   // Load 2nd half of 1st Lhs cell of next
                                        // iter, part 2
        "vmla.f32 q13, q3, d0[1]\n"     // Multiply 3rd Lhs cell with column 1
        "add %[lhs_ptr], %[lhs_ptr], #48\n"  // Move forward by 3 Lhs cells

        "vldr d0, [%[rhs_ptr]]\n"       // Load 1st half of Rhs cell of next
                                        // iteration
        "vmov d3, r2, r3\n"             // Prepare 2nd half of 1st Lhs cell of
                                        // next iteration
        "vmla.f32 q11, q2, d1[1]\n"     // Multiply 2nd Lhs cell with column 3
        "ldr r2, [%[rhs_ptr], #8]\n"    // Load 2nd half of Rhs cell of next
                                        // iteration, part 1
        "vmla.f32 q14, q3, d1[0]\n"     // Multiply 3rd Lhs cell with column 2
        "ldr r3, [%[rhs_ptr], #12]\n"   // Load 2nd half of Rhs cell of next
                                        // iteration, part 2
        "vmla.f32 q15, q3, d1[1]\n"     // Multiply 3rd Lhs cell with column 3

        // Loop branch. This will dual issue in fmla cycle 3 of the 4th block.
        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Store accumulators
        "mov r0, %[accum_ptr]\n"
        "vst1.32 {d8, d9}, [r0]!\n"
        "vst1.32 {d16, d17}, [r0]!\n"
        "vst1.32 {d24, d25}, [r0]!\n"
        "vst1.32 {d10, d11}, [r0]!\n"
        "vst1.32 {d18, d19}, [r0]!\n"
        "vst1.32 {d26, d27}, [r0]!\n"
        "vst1.32 {d12, d13}, [r0]!\n"
        "vst1.32 {d20, d21}, [r0]!\n"
        "vst1.32 {d28, d29}, [r0]!\n"
        "vst1.32 {d14, d15}, [r0]!\n"
        "vst1.32 {d22, d23}, [r0]!\n"
        "vst1.32 {d30, d31}, [r0]!\n"
        :  // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        :  // inputs
        [accum_ptr] "r"(accum_ptr)
        :  // clobbers
        "cc", "memory", "r0", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5",
        "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",
        "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26",
        "d27", "d28", "d29", "d30", "d31");
  }
};

struct NEON_32bit_GEMM_Float32_WithScalar_A53_depth2 {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov r0, %[accum_ptr]\n"
        "vld1.32 {d8, d9}, [r0]!\n"
        "vld1.32 {d16, d17}, [r0]!\n"
        "vld1.32 {d24, d25}, [r0]!\n"
        "vld1.32 {d10, d11}, [r0]!\n"
        "vld1.32 {d18, d19}, [r0]!\n"
        "vld1.32 {d26, d27}, [r0]!\n"
        "vld1.32 {d12, d13}, [r0]!\n"
        "vld1.32 {d20, d21}, [r0]!\n"
        "vld1.32 {d28, d29}, [r0]!\n"
        "vld1.32 {d14, d15}, [r0]!\n"
        "vld1.32 {d22, d23}, [r0]!\n"
        "vld1.32 {d30, d31}, [r0]!\n"

        // Overview of register layout:
        //
        // A 1x4 cell of Rhs is stored in d0--d1 (q0).
        // A 12x1 block of 3 4x1 cells Lhs is stored in d2--d7
        // (q1--q3).
        // A 12x4 block of accumulators is stored in q4--q15.
        //
        //                   +-----+-----+-----+-----+
        //              Rhs  |d0[0]|d0[1]|d1[0]|d1[1]|
        //                   +-----+-----+-----+-----+
        //
        //                   |     |     |     |     |
        //
        //    Lhs            |     |     |     |     |
        //
        //  +--+- - - - - -  +-----+-----+-----+-----+
        //  |d2|             | q4  | q5  | q6  | q7  |
        //  |d2|             | q4  | q5  | q6  | q7  |
        //  |d3|             | q4  | q5  | q6  | q7  |
        //  |d3|             | q4  | q5  | q6  | q7  |
        //  +--+- - - - - -  +-----+-----+-----+-----+
        //  |d4|             | q8  | q9  | q10 | q11 |
        //  |d4|             | q8  | q9  | q10 | q11 |
        //  |d5|             | q8  | q9  | q10 | q11 |
        //  |d5|             | q8  | q9  | q10 | q11 |
        //  +--+ - - - - - - +-----+-----+-----+-----+
        //  |d6|             | q12 | q13 | q14 | q15 |
        //  |d6|             | q12 | q13 | q14 | q15 |
        //  |d7|             | q12 | q13 | q14 | q15 |
        //  |d7|             | q12 | q13 | q14 | q15 |
        //  +--+- - - - - -  +-----+-----+-----+-----+
        //
        //                   Accumulator

        // Load Rhs cell
        "vldr d0, [%[rhs_ptr]]\n"
        "ldr r2, [%[rhs_ptr], #8]\n"
        "ldr r3, [%[rhs_ptr], #12]\n"

        // Load 1st Lhs Cell
        "vld1.32 {d2, d3}, [%[lhs_ptr]]\n"

        // Loop head - handling 2 levels of depth at once
        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Level of depth 1

        "vldr d4, [%[lhs_ptr], #32]\n"  // Load 1st half of 2nd Lhs cell
        "vmov d1, r2, r3\n"             // Prepare 2nd half of Rhs cell
        "vmla.f32 q4, q1, d0[0]\n"      // Multiply 1st Lhs cell with column 0
        "ldr r2, [%[lhs_ptr], #40]\n"   // Load 2nd half of 2nd Lhs cell, part 1
        "vmla.f32 q5, q1, d0[1]\n"      // Multiply 1st Lhs cell with column 1
        "ldr r3, [%[lhs_ptr], #44]\n"   // Load 2nd half of 2nd Lhs cell, part 2
        "vmla.f32 q6, q1, d1[0]\n"      // Multiply 1st Lhs cell with column 2

        "vldr d6, [%[lhs_ptr], #64]\n"  // Load 1st half of 3rd Lhs cell
        "vmov d5, r2, r3\n"             // Prepare 2nd half of 2nd Lhs cell
        "vmla.f32 q7, q1, d1[1]\n"      // Multiply 1st Lhs cell with column 3
        "ldr r2, [%[lhs_ptr], #72]\n"   // Load 2nd half of 3rd Lhs cell, part 1
        "vmla.f32 q8, q2, d0[0]\n"      // Multiply 2nd Lhs cell with column 0
        "ldr r3, [%[lhs_ptr], #76]\n"   // Load 2nd half of 3rd Lhs cell, part 2
        "vmla.f32 q9, q2, d0[1]\n"      // Multiply 2nd Lhs cell with column 1

        "vldr d2, [%[lhs_ptr], #16]\n"  // Load 1st half of 1st Lhs cell of next
                                        // iteration
        "vmov d7, r2, r3\n"             // Prepare 2nd half of 3rd Lhs cell
        "vmla.f32 q10, q2, d1[0]\n"     // Multiply 2nd Lhs cell with column 2
        "ldr r2, [%[lhs_ptr], #24]\n"   // Load 2nd half of 1st Lhs cell of next
                                        // iter, part 1
        "vmla.f32 q12, q3, d0[0]\n"     // Multiply 3rd Lhs cell with column 0
        "ldr r3, [%[lhs_ptr], #28]\n"   // Load 2nd half of 1st Lhs cell of next
                                        // iter, part 2
        "vmla.f32 q13, q3, d0[1]\n"     // Multiply 3rd Lhs cell with column 1

        "vldr d0, [%[rhs_ptr], #16]\n"  // Load 1st half of Rhs cell of next
                                        // iteration
        "vmov d3, r2, r3\n"             // Prepare 2nd half of 1st Lhs cell of
                                        // next iteration
        "vmla.f32 q11, q2, d1[1]\n"     // Multiply 2nd Lhs cell with column 3
        "ldr r2, [%[rhs_ptr], #24]\n"   // Load 2nd half of Rhs cell of next
                                        // iteration, part 1
        "vmla.f32 q14, q3, d1[0]\n"     // Multiply 3rd Lhs cell with column 2
        "ldr r3, [%[rhs_ptr], #28]\n"   // Load 2nd half of Rhs cell of next
                                        // iteration, part 2
        "vmla.f32 q15, q3, d1[1]\n"     // Multiply 3rd Lhs cell with column 3

        // Level of depth 2
        "vldr d4, [%[lhs_ptr], #48]\n"  // Load 1st half of 2nd Lhs cell
        "vmov d1, r2, r3\n"             // Prepare 2nd half of Rhs cell
        "vmla.f32 q4, q1, d0[0]\n"      // Multiply 1st Lhs cell with column 0
        "ldr r2, [%[lhs_ptr], #56]\n"   // Load 2nd half of 2nd Lhs cell, part 1
        "vmla.f32 q5, q1, d0[1]\n"      // Multiply 1st Lhs cell with column 1
        "ldr r3, [%[lhs_ptr], #60]\n"   // Load 2nd half of 2nd Lhs cell, part 2
        "vmla.f32 q6, q1, d1[0]\n"      // Multiply 1st Lhs cell with column 2
        "subs %[depth], #2\n"           // Decrement depth counter

        "vldr d6, [%[lhs_ptr], #80]\n"  // Load 1st half of 3rd Lhs cell
        "vmov d5, r2, r3\n"             // Prepare 2nd half of 2nd Lhs cell
        "vmla.f32 q7, q1, d1[1]\n"      // Multiply 1st Lhs cell with column 3
        "ldr r2, [%[lhs_ptr], #88]\n"   // Load 2nd half of 3rd Lhs cell, part 1
        "vmla.f32 q8, q2, d0[0]\n"      // Multiply 2nd Lhs cell with column 0
        "ldr r3, [%[lhs_ptr], #92]\n"   // Load 2nd half of 3rd Lhs cell, part 2
        "vmla.f32 q9, q2, d0[1]\n"      // Multiply 2nd Lhs cell with column 1
        "add %[rhs_ptr], %[rhs_ptr], #32\n"  // Move forward by 1 Rhs cell

        "vldr d2, [%[lhs_ptr], #96]\n"  // Load 1st half of 1st Lhs cell of next
                                        // iteration
        "vmov d7, r2, r3\n"             // Prepare 2nd half of 3rd Lhs cell
        "vmla.f32 q10, q2, d1[0]\n"     // Multiply 2nd Lhs cell with column 2
        "ldr r2, [%[lhs_ptr], #104]\n"  // Load 2nd half of 1st Lhs cell of next
                                        // iter, part 1
        "vmla.f32 q12, q3, d0[0]\n"     // Multiply 3rd Lhs cell with column 0
        "ldr r3, [%[lhs_ptr], #108]\n"  // Load 2nd half of 1st Lhs cell of next
                                        // iter, part 2
        "vmla.f32 q13, q3, d0[1]\n"     // Multiply 3rd Lhs cell with column 1
        "add %[lhs_ptr], %[lhs_ptr], #96\n"  // Move forward by 3 Lhs cells

        "vldr d0, [%[rhs_ptr]]\n"       // Load 1st half of Rhs cell of next
                                        // iteration
        "vmov d3, r2, r3\n"             // Prepare 2nd half of 1st Lhs cell of
                                        // next iteration
        "vmla.f32 q11, q2, d1[1]\n"     // Multiply 2nd Lhs cell with column 3
        "ldr r2, [%[rhs_ptr], #8]\n"    // Load 2nd half of Rhs cell of next
                                        // iteration, part 1
        "vmla.f32 q14, q3, d1[0]\n"     // Multiply 3rd Lhs cell with column 2
        "ldr r3, [%[rhs_ptr], #12]\n"   // Load 2nd half of Rhs cell of next
                                        // iteration, part 2
        "vmla.f32 q15, q3, d1[1]\n"     // Multiply 3rd Lhs cell with column 3

        // Loop branch. This will dual issue in fmla cycle 3 of the 4th block.
        //"bne loop_%=\n"
        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Store accumulators
        "mov r0, %[accum_ptr]\n"
        "vst1.32 {d8, d9}, [r0]!\n"
        "vst1.32 {d16, d17}, [r0]!\n"
        "vst1.32 {d24, d25}, [r0]!\n"
        "vst1.32 {d10, d11}, [r0]!\n"
        "vst1.32 {d18, d19}, [r0]!\n"
        "vst1.32 {d26, d27}, [r0]!\n"
        "vst1.32 {d12, d13}, [r0]!\n"
        "vst1.32 {d20, d21}, [r0]!\n"
        "vst1.32 {d28, d29}, [r0]!\n"
        "vst1.32 {d14, d15}, [r0]!\n"
        "vst1.32 {d22, d23}, [r0]!\n"
        "vst1.32 {d30, d31}, [r0]!\n"
        :  // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        :  // inputs
        [accum_ptr] "r"(accum_ptr)
        :  // clobbers
        "cc", "memory", "r0", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d5",
        "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",
        "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26",
        "d27", "d28", "d29", "d30", "d31");
  }
};

// This rotating variant performs well when permutations (vext) can be
// dual-issued with arithmetic instructions. (A scalar sketch of the rotation
// scheme follows this kernel.)
struct NEON_32bit_GEMM_Float32_MLA_Rotating {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov r0, %[accum_ptr]\n"
        "vld1.32 {d8, d9}, [r0]!\n"
        "vld1.32 {d16, d17}, [r0]!\n"
        "vld1.32 {d24, d25}, [r0]!\n"
        "vld1.32 {d10, d11}, [r0]!\n"
        "vld1.32 {d18, d19}, [r0]!\n"
        "vld1.32 {d26, d27}, [r0]!\n"
        "vld1.32 {d12, d13}, [r0]!\n"
        "vld1.32 {d20, d21}, [r0]!\n"
        "vld1.32 {d28, d29}, [r0]!\n"
        "vld1.32 {d14, d15}, [r0]!\n"
        "vld1.32 {d22, d23}, [r0]!\n"
        "vld1.32 {d30, d31}, [r0]!\n"

#define NEON_32BIT_ROTATING_FLOAT_KERNEL_TRANSPOSE_ACCUMULATOR_CELLS \
  "vtrn.32 q4, q5\n"                                                 \
  "vtrn.32 q6, q7\n"                                                 \
  "vswp d9, d12\n"                                                   \
  "vswp d11, d14\n"                                                  \
  "vtrn.32 q8, q9\n"                                                 \
  "vtrn.32 q10, q11\n"                                               \
  "vswp d17, d20\n"                                                  \
  "vswp d19, d22\n"                                                  \
  "vtrn.32 q12, q13\n"                                               \
  "vtrn.32 q14, q15\n"                                               \
  "vswp d25, d28\n"                                                  \
  "vswp d27, d30\n"

#define NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(a, b, c) \
  NEON_32BIT_ROTATING_FLOAT_KERNEL_TRANSPOSE_ACCUMULATOR_CELLS             \
  "vext.32 q5, q5, q5, #" #a "\n"                                          \
  "vext.32 q6, q6, q6, #" #b "\n"                                          \
  "vext.32 q7, q7, q7, #" #c "\n"                                          \
  "vext.32 q9, q9, q9, #" #a "\n"                                          \
  "vext.32 q10, q10, q10, #" #b "\n"                                       \
  "vext.32 q11, q11, q11, #" #c "\n"                                       \
  "vext.32 q13, q13, q13, #" #a "\n"                                       \
  "vext.32 q14, q14, q14, #" #b "\n"                                       \
  "vext.32 q15, q15, q15, #" #c "\n"                                       \
  NEON_32BIT_ROTATING_FLOAT_KERNEL_TRANSPOSE_ACCUMULATOR_CELLS

        NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(1, 2, 3)

        //"loop_%=:\n"
        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Load 1 Rhs cell of size 1x4
        "vld1.32 {d0, d1}, [%[rhs_ptr]]!\n"

        // Load 3 Lhs cells of size 4x1 each
        "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
        "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
        "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"

        // Multiply-accumulate
        "vmla.f32 q4, q1, q0\n"
        "vmla.f32 q8, q2, q0\n"
        "vmla.f32 q12, q3, q0\n"
        "vext.f32 q0, q0, q0, #1\n"
        "vmla.f32 q5, q1, q0\n"
        "vmla.f32 q9, q2, q0\n"
        "vmla.f32 q13, q3, q0\n"
        "vext.f32 q0, q0, q0, #1\n"
        "vmla.f32 q6, q1, q0\n"
        "vmla.f32 q10, q2, q0\n"
        "vmla.f32 q14, q3, q0\n"
        "vext.f32 q0, q0, q0, #1\n"
        "vmla.f32 q7, q1, q0\n"
        "vmla.f32 q11, q2, q0\n"
        "vmla.f32 q15, q3, q0\n"

        // Loop. Decrement loop index (depth) by 1, since we just handled 1
        // level of depth.
        "subs %[depth], #1\n"
        //"bne loop_%=\n"
        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Store accumulators
        "mov r0, %[accum_ptr]\n"

        NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(3, 2, 1)

        "vst1.32 {d8, d9}, [r0]!\n"
        "vst1.32 {d16, d17}, [r0]!\n"
        "vst1.32 {d24, d25}, [r0]!\n"
        "vst1.32 {d10, d11}, [r0]!\n"
        "vst1.32 {d18, d19}, [r0]!\n"
        "vst1.32 {d26, d27}, [r0]!\n"
        "vst1.32 {d12, d13}, [r0]!\n"
        "vst1.32 {d20, d21}, [r0]!\n"
        "vst1.32 {d28, d29}, [r0]!\n"
        "vst1.32 {d14, d15}, [r0]!\n"
        "vst1.32 {d22, d23}, [r0]!\n"
        "vst1.32 {d30, d31}, [r0]!\n"
        :  // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        :  // inputs
        [accum_ptr] "r"(accum_ptr)
        :  // clobbers
        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
        "d28", "d29", "d30", "d31");
  }
};

// This rotating variant performs well when permutations (vext) can be
// dual-issued with arithmetic instructions. It is relevant because the
// rotating approach removes the need for multiply-with-scalar instructions,
// and ARMv7 FMA does not have a with-scalar variant.
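// To make the rotation trick concrete (an illustration of the code below,
// not part of the original comments): with the Rhs cell held in
// q0 = {r0, r1, r2, r3}, each "vext.f32 q0, q0, q0, #1" rotates the lanes
// to {r1, r2, r3, r0}, so four plain vector-vector FMAs against successive
// rotations of q0 consume all four Rhs scalars without any
// multiply-with-scalar instruction. The ..._ROTATE_ACCUMULATOR_CELLS(1, 2, 3)
// prologue pre-rotates the accumulator cells to match this access pattern,
// and the (3, 2, 1) epilogue rotates them back before storing.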
struct NEON_32bit_GEMM_Float32_FMA_Rotating {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov r0, %[accum_ptr]\n"
        "vld1.32 {d8, d9}, [r0]!\n"
        "vld1.32 {d16, d17}, [r0]!\n"
        "vld1.32 {d24, d25}, [r0]!\n"
        "vld1.32 {d10, d11}, [r0]!\n"
        "vld1.32 {d18, d19}, [r0]!\n"
        "vld1.32 {d26, d27}, [r0]!\n"
        "vld1.32 {d12, d13}, [r0]!\n"
        "vld1.32 {d20, d21}, [r0]!\n"
        "vld1.32 {d28, d29}, [r0]!\n"
        "vld1.32 {d14, d15}, [r0]!\n"
        "vld1.32 {d22, d23}, [r0]!\n"
        "vld1.32 {d30, d31}, [r0]!\n"

        NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(1, 2, 3)

        //"loop_%=:\n"
        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Load 1 Rhs cell of size 1x4
        "vld1.32 {d0, d1}, [%[rhs_ptr]]!\n"

        // Load 3 Lhs cells of size 4x1 each
        "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
        "vld1.32 {d4, d5}, [%[lhs_ptr]]!\n"
        "vld1.32 {d6, d7}, [%[lhs_ptr]]!\n"

        // Multiply-accumulate
        "vfma.f32 q4, q1, q0\n"
        "vfma.f32 q8, q2, q0\n"
        "vfma.f32 q12, q3, q0\n"
        "vext.f32 q0, q0, q0, #1\n"
        "vfma.f32 q5, q1, q0\n"
        "vfma.f32 q9, q2, q0\n"
        "vfma.f32 q13, q3, q0\n"
        "vext.f32 q0, q0, q0, #1\n"
        "vfma.f32 q6, q1, q0\n"
        "vfma.f32 q10, q2, q0\n"
        "vfma.f32 q14, q3, q0\n"
        "vext.f32 q0, q0, q0, #1\n"
        "vfma.f32 q7, q1, q0\n"
        "vfma.f32 q11, q2, q0\n"
        "vfma.f32 q15, q3, q0\n"

        // Loop. Decrement loop index (depth) by 1, since we just handled 1
        // level of depth.
        "subs %[depth], #1\n"
        //"bne loop_%=\n"
        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        NEON_32BIT_ROTATING_FLOAT_KERNEL_ROTATE_ACCUMULATOR_CELLS(3, 2, 1)

        // Store accumulators
        "mov r0, %[accum_ptr]\n"
        "vst1.32 {d8, d9}, [r0]!\n"
        "vst1.32 {d16, d17}, [r0]!\n"
        "vst1.32 {d24, d25}, [r0]!\n"
        "vst1.32 {d10, d11}, [r0]!\n"
        "vst1.32 {d18, d19}, [r0]!\n"
        "vst1.32 {d26, d27}, [r0]!\n"
        "vst1.32 {d12, d13}, [r0]!\n"
        "vst1.32 {d20, d21}, [r0]!\n"
        "vst1.32 {d28, d29}, [r0]!\n"
        "vst1.32 {d14, d15}, [r0]!\n"
        "vst1.32 {d22, d23}, [r0]!\n"
        "vst1.32 {d30, d31}, [r0]!\n"
        : // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        : // inputs
        [accum_ptr] "r"(accum_ptr)
        : // clobbers
        "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17",
        "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
        "d28", "d29", "d30", "d31");
  }
};

#endif  // __arm__
#ifdef __aarch64__

// This is the current standard kernel in gemmlowp, see:
// https://github.com/google/gemmlowp/blob/b1e2a29ff866680028f3080efc244e10e8dd7f46/internal/kernel_neon.h#L646
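// (Note: the operands are uint8, but the kernel below first widens them to
// uint16 with UXTL because the by-element forms of UMLAL/UMLAL2 that it
// relies on exist only for 16-bit and 32-bit lanes, not for 8-bit ones.)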
struct NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators {
  typedef std::uint8_t OperandType;
  typedef std::uint32_t AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 2> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load 1 Rhs cell of size 2x8
        "ld1 {v5.8b}, [%[rhs_ptr]], #8\n"
        "ld1 {v6.8b}, [%[rhs_ptr]], #8\n"

        // Load 3 Lhs cells of size 4x2 each
        "ld1 {v2.8b}, [%[lhs_ptr]], #8\n"
        "ld1 {v3.8b}, [%[lhs_ptr]], #8\n"
        "ld1 {v4.8b}, [%[lhs_ptr]], #8\n"

        "subs %w[depth], %w[depth], #2\n"

        // Load accumulators
        "mov x0, %[accum_ptr]\n"
        "ld1 {v8.16b}, [x0], #16\n"
        "ld1 {v16.16b}, [x0], #16\n"
        "ld1 {v24.16b}, [x0], #16\n"
        "ld1 {v9.16b}, [x0], #16\n"
        "ld1 {v17.16b}, [x0], #16\n"
        "ld1 {v25.16b}, [x0], #16\n"
        "ld1 {v10.16b}, [x0], #16\n"
        "ld1 {v18.16b}, [x0], #16\n"
        "ld1 {v26.16b}, [x0], #16\n"
        "ld1 {v11.16b}, [x0], #16\n"
        "ld1 {v19.16b}, [x0], #16\n"
        "ld1 {v27.16b}, [x0], #16\n"
        "ld1 {v12.16b}, [x0], #16\n"
        "ld1 {v20.16b}, [x0], #16\n"
        "ld1 {v28.16b}, [x0], #16\n"
        "ld1 {v13.16b}, [x0], #16\n"
        "ld1 {v21.16b}, [x0], #16\n"
        "ld1 {v29.16b}, [x0], #16\n"
        "ld1 {v14.16b}, [x0], #16\n"
        "ld1 {v22.16b}, [x0], #16\n"
        "ld1 {v30.16b}, [x0], #16\n"
        "ld1 {v15.16b}, [x0], #16\n"
        "ld1 {v23.16b}, [x0], #16\n"
        "ld1 {v31.16b}, [x0], #16\n"

        "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"

        //"loop_%=:\n"
        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Overview of register layout:
        //
        // A 2x8 block of 2 2x4 cells of Rhs is stored in 16bit in v0--v1.
        // A 12x2 block of 3 4x2 cells of Lhs is stored in 16bit in v2--v4.
        // A 12x8 block of accumulators is stored in 32bit in v8--v31.
        //
        //                           +--------+--------+-----+--------+--------+
        //                           |v0.h[0] |v0.h[1] | ... |v1.h[2] |v1.h[3] |
        //                      Rhs  +--------+--------+-----+--------+--------+
        //                           |v0.h[4] |v0.h[5] | ... |v1.h[6] |v1.h[7] |
        //                           +--------+--------+-----+--------+--------+
        //
        //                           |        |        |     |        |        |
        //
        //    Lhs                    |        |        |     |        |        |
        //
        //  +-------+-------+ - - -  +--------+--------+-----+--------+--------+
        //  |v2.h[0]|v2.h[4]|        |v8.s[0] |v9.s[0] | ... |v14.s[0]|v15.s[0]|
        //  |v2.h[1]|v2.h[5]|        |v8.s[1] |v9.s[1] | ... |v14.s[1]|v15.s[1]|
        //  |v2.h[2]|v2.h[6]|        |v8.s[2] |v9.s[2] | ... |v14.s[2]|v15.s[2]|
        //  |v2.h[3]|v2.h[7]|        |v8.s[3] |v9.s[3] | ... |v14.s[3]|v15.s[3]|
        //  +-------+-------+ - - -  +--------+--------+-----+--------+--------+
        //  |v3.h[0]|v3.h[4]|        |v16.s[0]|v17.s[0]| ... |v22.s[0]|v23.s[0]|
        //  |v3.h[1]|v3.h[5]|        |v16.s[1]|v17.s[1]| ... |v22.s[1]|v23.s[1]|
        //  |v3.h[2]|v3.h[6]|        |v16.s[2]|v17.s[2]| ... |v22.s[2]|v23.s[2]|
        //  |v3.h[3]|v3.h[7]|        |v16.s[3]|v17.s[3]| ... |v22.s[3]|v23.s[3]|
        //  +-------+-------+ - - -  +--------+--------+-----+--------+--------+
        //  |v4.h[0]|v4.h[4]|        |v24.s[0]|v25.s[0]| ... |v30.s[0]|v31.s[0]|
        //  |v4.h[1]|v4.h[5]|        |v24.s[1]|v25.s[1]| ... |v30.s[1]|v31.s[1]|
        //  |v4.h[2]|v4.h[6]|        |v24.s[2]|v25.s[2]| ... |v30.s[2]|v31.s[2]|
        //  |v4.h[3]|v4.h[7]|        |v24.s[3]|v25.s[3]| ... |v30.s[3]|v31.s[3]|
        //  +-------+-------+ - - -  +--------+--------+-----+--------+--------+
        //
        //                                       Accumulator

        // Expand Lhs/Rhs cells to 16 bit.
        "uxtl v0.8h, v5.8b\n"
        "ld1 {v5.8b}, [%[rhs_ptr]], #8\n"
        "uxtl v1.8h, v6.8b\n"
        "ld1 {v6.8b}, [%[rhs_ptr]], #8\n"
        "uxtl v2.8h, v2.8b\n"
        "uxtl v3.8h, v3.8b\n"
        "uxtl v4.8h, v4.8b\n"

        // Multiply-accumulate, top third
        "umlal v8.4s, v2.4h, v0.h[0]\n"
        "umlal v9.4s, v2.4h, v0.h[1]\n"
        "umlal v10.4s, v2.4h, v0.h[2]\n"
        "umlal v11.4s, v2.4h, v0.h[3]\n"
        "umlal v12.4s, v2.4h, v1.h[0]\n"
        "umlal v13.4s, v2.4h, v1.h[1]\n"
        "umlal v14.4s, v2.4h, v1.h[2]\n"
        "umlal v15.4s, v2.4h, v1.h[3]\n"
        "umlal2 v8.4s, v2.8h, v0.h[4]\n"
        "umlal2 v9.4s, v2.8h, v0.h[5]\n"
        "umlal2 v10.4s, v2.8h, v0.h[6]\n"
        "umlal2 v11.4s, v2.8h, v0.h[7]\n"
        "umlal2 v12.4s, v2.8h, v1.h[4]\n"
        "umlal2 v13.4s, v2.8h, v1.h[5]\n"
        "umlal2 v14.4s, v2.8h, v1.h[6]\n"
        "umlal2 v15.4s, v2.8h, v1.h[7]\n"
        "ld1 {v2.8b}, [%[lhs_ptr]], #8\n"

        // Multiply-accumulate, middle third
        "umlal v16.4s, v3.4h, v0.h[0]\n"
        "umlal v17.4s, v3.4h, v0.h[1]\n"
        "umlal v18.4s, v3.4h, v0.h[2]\n"
        "umlal v19.4s, v3.4h, v0.h[3]\n"
        "umlal v20.4s, v3.4h, v1.h[0]\n"
        "umlal v21.4s, v3.4h, v1.h[1]\n"
        "umlal v22.4s, v3.4h, v1.h[2]\n"
        "umlal v23.4s, v3.4h, v1.h[3]\n"
        "umlal2 v16.4s, v3.8h, v0.h[4]\n"
        "umlal2 v17.4s, v3.8h, v0.h[5]\n"
        "umlal2 v18.4s, v3.8h, v0.h[6]\n"
        "umlal2 v19.4s, v3.8h, v0.h[7]\n"
        "umlal2 v20.4s, v3.8h, v1.h[4]\n"
        "umlal2 v21.4s, v3.8h, v1.h[5]\n"
        "umlal2 v22.4s, v3.8h, v1.h[6]\n"
        "umlal2 v23.4s, v3.8h, v1.h[7]\n"
        "ld1 {v3.8b}, [%[lhs_ptr]], #8\n"

        "subs %w[depth], %w[depth], #2\n"

        // Multiply-accumulate, bottom third
        "umlal v24.4s, v4.4h, v0.h[0]\n"
        "umlal v25.4s, v4.4h, v0.h[1]\n"
        "umlal v26.4s, v4.4h, v0.h[2]\n"
        "umlal v27.4s, v4.4h, v0.h[3]\n"
        "umlal v28.4s, v4.4h, v1.h[0]\n"
        "umlal v29.4s, v4.4h, v1.h[1]\n"
        "umlal v30.4s, v4.4h, v1.h[2]\n"
        "umlal v31.4s, v4.4h, v1.h[3]\n"
        "umlal2 v24.4s, v4.8h, v0.h[4]\n"
        "umlal2 v25.4s, v4.8h, v0.h[5]\n"
        "umlal2 v26.4s, v4.8h, v0.h[6]\n"
        "umlal2 v27.4s, v4.8h, v0.h[7]\n"
        "umlal2 v28.4s, v4.8h, v1.h[4]\n"
        "umlal2 v29.4s, v4.8h, v1.h[5]\n"
        "umlal2 v30.4s, v4.8h, v1.h[6]\n"
        "umlal2 v31.4s, v4.8h, v1.h[7]\n"
        "ld1 {v4.8b}, [%[lhs_ptr]], #8\n"

        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        GEMMLOWP_LABEL_AFTER_LOOP
        ":\n"

        // Expand Lhs/Rhs cells to 16 bit.
        "uxtl v0.8h, v5.8b\n"
        "uxtl v1.8h, v6.8b\n"
        "uxtl v2.8h, v2.8b\n"
        "uxtl v3.8h, v3.8b\n"
        "uxtl v4.8h, v4.8b\n"

        // Multiply-accumulate, level of depth 0
        "umlal v8.4s, v2.4h, v0.h[0]\n"
        "umlal v9.4s, v2.4h, v0.h[1]\n"
        "umlal v10.4s, v2.4h, v0.h[2]\n"
        "umlal v11.4s, v2.4h, v0.h[3]\n"
        "umlal v12.4s, v2.4h, v1.h[0]\n"
        "umlal v13.4s, v2.4h, v1.h[1]\n"
        "umlal v14.4s, v2.4h, v1.h[2]\n"
        "umlal v15.4s, v2.4h, v1.h[3]\n"
        "umlal v16.4s, v3.4h, v0.h[0]\n"
        "umlal v17.4s, v3.4h, v0.h[1]\n"
        "umlal v18.4s, v3.4h, v0.h[2]\n"
        "umlal v19.4s, v3.4h, v0.h[3]\n"
        "umlal v20.4s, v3.4h, v1.h[0]\n"
        "umlal v21.4s, v3.4h, v1.h[1]\n"
        "umlal v22.4s, v3.4h, v1.h[2]\n"
        "umlal v23.4s, v3.4h, v1.h[3]\n"
        "umlal v24.4s, v4.4h, v0.h[0]\n"
        "umlal v25.4s, v4.4h, v0.h[1]\n"
        "umlal v26.4s, v4.4h, v0.h[2]\n"
        "umlal v27.4s, v4.4h, v0.h[3]\n"
        "umlal v28.4s, v4.4h, v1.h[0]\n"
        "umlal v29.4s, v4.4h, v1.h[1]\n"
        "umlal v30.4s, v4.4h, v1.h[2]\n"
        "umlal v31.4s, v4.4h, v1.h[3]\n"

        // Multiply-accumulate, level of depth 1
        "umlal2 v8.4s, v2.8h, v0.h[4]\n"
        "umlal2 v9.4s, v2.8h, v0.h[5]\n"
        "umlal2 v10.4s, v2.8h, v0.h[6]\n"
        "umlal2 v11.4s, v2.8h, v0.h[7]\n"
        "umlal2 v12.4s, v2.8h, v1.h[4]\n"
        "umlal2 v13.4s, v2.8h, v1.h[5]\n"
        "umlal2 v14.4s, v2.8h, v1.h[6]\n"
        "umlal2 v15.4s, v2.8h, v1.h[7]\n"
        "umlal2 v16.4s, v3.8h, v0.h[4]\n"
        "umlal2 v17.4s, v3.8h, v0.h[5]\n"
        "umlal2 v18.4s, v3.8h, v0.h[6]\n"
        "umlal2 v19.4s, v3.8h, v0.h[7]\n"
        "umlal2 v20.4s, v3.8h, v1.h[4]\n"
        "umlal2 v21.4s, v3.8h, v1.h[5]\n"
        "umlal2 v22.4s, v3.8h, v1.h[6]\n"
        "umlal2 v23.4s, v3.8h, v1.h[7]\n"
        "umlal2 v24.4s, v4.8h, v0.h[4]\n"
        "umlal2 v25.4s, v4.8h, v0.h[5]\n"
        "umlal2 v26.4s, v4.8h, v0.h[6]\n"
        "umlal2 v27.4s, v4.8h, v0.h[7]\n"
        "umlal2 v28.4s, v4.8h, v1.h[4]\n"
        "umlal2 v29.4s, v4.8h, v1.h[5]\n"
        "umlal2 v30.4s, v4.8h, v1.h[6]\n"
        "umlal2 v31.4s, v4.8h, v1.h[7]\n"

        // Store accumulators
        "mov x0, %[accum_ptr]\n"
        "st1 {v8.16b}, [x0], #16\n"
        "st1 {v16.16b}, [x0], #16\n"
        "st1 {v24.16b}, [x0], #16\n"
        "st1 {v9.16b}, [x0], #16\n"
        "st1 {v17.16b}, [x0], #16\n"
        "st1 {v25.16b}, [x0], #16\n"
        "st1 {v10.16b}, [x0], #16\n"
        "st1 {v18.16b}, [x0], #16\n"
        "st1 {v26.16b}, [x0], #16\n"
        "st1 {v11.16b}, [x0], #16\n"
        "st1 {v19.16b}, [x0], #16\n"
        "st1 {v27.16b}, [x0], #16\n"
        "st1 {v12.16b}, [x0], #16\n"
        "st1 {v20.16b}, [x0], #16\n"
        "st1 {v28.16b}, [x0], #16\n"
        "st1 {v13.16b}, [x0], #16\n"
        "st1 {v21.16b}, [x0], #16\n"
        "st1 {v29.16b}, [x0], #16\n"
        "st1 {v14.16b}, [x0], #16\n"
        "st1 {v22.16b}, [x0], #16\n"
        "st1 {v30.16b}, [x0], #16\n"
        "st1 {v15.16b}, [x0], #16\n"
        "st1 {v23.16b}, [x0], #16\n"
        "st1 {v31.16b}, [x0], #16\n"
        : // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        : // inputs
        [accum_ptr] "r"(accum_ptr)
        : // clobbers
        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28", "v29", "v30", "v31");
  }
};
// Faster kernel by ARM. Not expanding operands before multiplication.
// Tuned for A57. Compare to
// NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand.
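// For reference on the instruction pairing used below: "umull v6.8h,
// v0.8b, v4.8b" forms eight 16-bit products from the low 8 bytes of v0 and
// v4 (umull2 does the same for the high 8 bytes), and "uadalp v12.4s,
// v6.8h" then adds adjacent pairs of those 16-bit values into the four
// 32-bit lanes of the aggregator v12.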
struct NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand_A57 {
  typedef std::uint8_t OperandType;
  typedef std::uint32_t AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<5, 16, CellOrder::WidthMajor>, 1>,
      KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    static const int kLhsWidth = Format::Lhs::kWidth;
    static const int kRhsWidth = Format::Rhs::kWidth;
    AccumulatorType rowmajor_accumulator_buffer[kLhsWidth * kRhsWidth];
    asm volatile(
        // Clear aggregators
        "dup v12.4s, wzr\n"
        "dup v13.4s, wzr\n"
        "dup v14.4s, wzr\n"
        "dup v15.4s, wzr\n"
        "dup v16.4s, wzr\n"
        "dup v17.4s, wzr\n"
        "dup v18.4s, wzr\n"
        "dup v19.4s, wzr\n"
        "dup v20.4s, wzr\n"
        "dup v21.4s, wzr\n"
        "dup v22.4s, wzr\n"
        "dup v23.4s, wzr\n"
        "dup v24.4s, wzr\n"
        "dup v25.4s, wzr\n"
        "dup v26.4s, wzr\n"
        "dup v27.4s, wzr\n"
        "dup v28.4s, wzr\n"
        "dup v29.4s, wzr\n"
        "dup v30.4s, wzr\n"
        "dup v31.4s, wzr\n"

        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Overview of register layout:
        //
        // A 4x16 block of Rhs is stored in 8 bit in v0--v3.
        // A 5x16 block of Lhs is cycled through v4 and v5 in 8 bit.
        //
        // A 4x5 block of aggregators is stored in v12-v31 (as 4x32 bit
        // components which would need to be added at the end).
        //
        // The Lhs vectors are multiplied by the Rhs vectors with a widening
        // multiply to produce an intermediate result which is stored in
        // v6-v11. Each intermediate result is 8x16 bits so this happens
        // twice for each Lhs/Rhs combination (once with UMULL for elements
        // 0-7 and once with UMULL2 for elements 8-15).
        //
        // UADALP is used to accumulate these intermediate results into the
        // result aggregators.
        //
        //                               +--------+--------+--------+--------+
        //                               |v0.b[0] |v1.b[0] |v2.b[0] |v3.b[0] |
        //                          Rhs  +--------+--------+--------+--------+
        //                               |  ...   |  ...   |  ...   |  ...   |
        //                               +--------+--------+--------+--------+
        //                               |v0.b[15]|v1.b[15]|v2.b[15]|v3.b[15]|
        //                               +--------+--------+--------+--------+
        //
        //                               |        |        |        |        |
        //
        //    Lhs                        |        |        |        |        |
        //
        //  +-------+-----+--------+ - - +--------+--------+--------+--------+
        //  |v4.b[0]| ... |v4.b[15]|     | v12.4s | v13.4s | v14.4s | v15.4s |
        //  |v5.b[0]| ... |v5.b[15]|     | v16.4s | v17.4s | v18.4s | v19.4s |
        //  |v4.b[0]| ... |v4.b[15]|     | v20.4s | v21.4s | v22.4s | v23.4s |
        //  |v5.b[0]| ... |v5.b[15]|     | v24.4s | v25.4s | v26.4s | v27.4s |
        //  |v4.b[0]| ... |v4.b[15]|     | v28.4s | v29.4s | v30.4s | v31.4s |
        //  +-------+--------------+ - - +--------+--------+--------+--------+
        //
        //                                        Accumulator
        //
        // Further possible optimisations (not tried):
        // - Move early loads into the previous iteration (see
        //   Float32_WithScalar for an example).
        // - Unroll the loop 2x to alternate more smoothly between v4 and v5.
        // - A different number of temporary registers might work better.
        // - Pairing umull with the corresponding umull2 might allow better
        //   register loading (e.g. at the start of the loop).
        // - Interleaving umull{2} and uadalp even more aggressively might
        //   help (not sure about latency vs. dispatch rate).
        //
        // Start loading Rhs - further loads are interleaved amongst the
        // multiplies for better dispatch on A57.
        "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"

        // Load first Lhs vector - further loads are interleaved amongst the
        // multiplies.
        "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"

        "umull v6.8h, v0.8b, v4.8b\n"
        "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"  // 2nd RHS element
        "umull v7.8h, v1.8b, v4.8b\n"
        "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"  // 3rd RHS element
        "umull v8.8h, v2.8b, v4.8b\n"
        "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"  // 4th RHS element
        "umull v9.8h, v3.8b, v4.8b\n"
        "umull2 v10.8h, v0.16b, v4.16b\n"
        "umull2 v11.8h, v1.16b, v4.16b\n"
        "ld1 {v5.16b}, [%[lhs_ptr]], #16\n"  // 2nd LHS element

        "uadalp v12.4s, v6.8h\n"
        "umull2 v6.8h, v2.16b, v4.16b\n"
        "uadalp v13.4s, v7.8h\n"
        "umull2 v7.8h, v3.16b, v4.16b\n"
        "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"  // 1st LHS element done - Reuse v4
                                             // for 3rd LHS element
        "uadalp v14.4s, v8.8h\n"
        "umull v8.8h, v0.8b, v5.8b\n"
        "uadalp v15.4s, v9.8h\n"
        "umull v9.8h, v1.8b, v5.8b\n"
        "uadalp v12.4s, v10.8h\n"
        "umull v10.8h, v2.8b, v5.8b\n"
        "uadalp v13.4s, v11.8h\n"
        "umull v11.8h, v3.8b, v5.8b\n"

        "uadalp v14.4s, v6.8h\n"
        "umull2 v6.8h, v0.16b, v5.16b\n"
        "uadalp v15.4s, v7.8h\n"
        "umull2 v7.8h, v1.16b, v5.16b\n"
        "uadalp v16.4s, v8.8h\n"
        "umull2 v8.8h, v2.16b, v5.16b\n"
        "uadalp v17.4s, v9.8h\n"
        "umull2 v9.8h, v3.16b, v5.16b\n"
        "ld1 {v5.16b}, [%[lhs_ptr]], #16\n"  // 2nd LHS element done - Reuse v5
                                             // for 4th LHS element
        "uadalp v18.4s, v10.8h\n"
        "umull v10.8h, v0.8b, v4.8b\n"
        "uadalp v19.4s, v11.8h\n"
        "umull v11.8h, v1.8b, v4.8b\n"

        "uadalp v16.4s, v6.8h\n"
        "umull v6.8h, v2.8b, v4.8b\n"
        "uadalp v17.4s, v7.8h\n"
        "umull v7.8h, v3.8b, v4.8b\n"
        "uadalp v18.4s, v8.8h\n"
        "umull2 v8.8h, v0.16b, v4.16b\n"
        "uadalp v19.4s, v9.8h\n"
        "umull2 v9.8h, v1.16b, v4.16b\n"
        "uadalp v20.4s, v10.8h\n"
        "umull2 v10.8h, v2.16b, v4.16b\n"
        "uadalp v21.4s, v11.8h\n"
        "umull2 v11.8h, v3.16b, v4.16b\n"
        "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"  // 3rd LHS element done - Reuse v4
                                             // for 5th LHS element

        "uadalp v22.4s, v6.8h\n"
        "umull v6.8h, v0.8b, v5.8b\n"
        "uadalp v23.4s, v7.8h\n"
        "umull v7.8h, v1.8b, v5.8b\n"
        "uadalp v20.4s, v8.8h\n"
        "umull v8.8h, v2.8b, v5.8b\n"
        "uadalp v21.4s, v9.8h\n"
        "umull v9.8h, v3.8b, v5.8b\n"
        "uadalp v22.4s, v10.8h\n"
        "umull2 v10.8h, v0.16b, v5.16b\n"
        "uadalp v23.4s, v11.8h\n"
        "umull2 v11.8h, v1.16b, v5.16b\n"

        "uadalp v24.4s, v6.8h\n"
        "umull2 v6.8h, v2.16b, v5.16b\n"
        "uadalp v25.4s, v7.8h\n"
        "umull2 v7.8h, v3.16b, v5.16b\n"
        "uadalp v26.4s, v8.8h\n"
        "umull v8.8h, v0.8b, v4.8b\n"
        "uadalp v27.4s, v9.8h\n"
        "umull v9.8h, v1.8b, v4.8b\n"
        "uadalp v24.4s, v10.8h\n"
        "umull v10.8h, v2.8b, v4.8b\n"
        "uadalp v25.4s, v11.8h\n"
        "umull v11.8h, v3.8b, v4.8b\n"

        "uadalp v26.4s, v6.8h\n"
        "umull2 v6.8h, v0.16b, v4.16b\n"
        "uadalp v27.4s, v7.8h\n"
        "umull2 v7.8h, v1.16b, v4.16b\n"
        "uadalp v28.4s, v8.8h\n"
        "umull2 v8.8h, v2.16b, v4.16b\n"
        "uadalp v29.4s, v9.8h\n"
        "umull2 v9.8h, v3.16b, v4.16b\n"
        "uadalp v30.4s, v10.8h\n"
        "uadalp v31.4s, v11.8h\n"

        "uadalp v28.4s, v6.8h\n"
        "uadalp v29.4s, v7.8h\n"
        // Loop. Decrement loop index (depth) by 16, since we just handled
        // 16 levels of depth. Do this subs a bit before the end of the loop
        // for better dispatch on A57.
        "subs %w[depth], %w[depth], #16\n"
        "uadalp v30.4s, v8.8h\n"
        "uadalp v31.4s, v9.8h\n"

        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Reduce aggregators horizontally
        "addp v0.4s, v12.4s, v13.4s\n"
        "addp v1.4s, v14.4s, v15.4s\n"
        "addp v2.4s, v16.4s, v17.4s\n"
        "addp v3.4s, v18.4s, v19.4s\n"
        "addp v4.4s, v20.4s, v21.4s\n"
        "addp v5.4s, v22.4s, v23.4s\n"
        "addp v6.4s, v24.4s, v25.4s\n"
        "addp v7.4s, v26.4s, v27.4s\n"
        "addp v8.4s, v28.4s, v29.4s\n"
        "addp v9.4s, v30.4s, v31.4s\n"

        "addp v10.4s, v0.4s, v1.4s\n"
        "addp v11.4s, v2.4s, v3.4s\n"
        "addp v12.4s, v4.4s, v5.4s\n"
        "addp v13.4s, v6.4s, v7.4s\n"
        "addp v14.4s, v8.4s, v9.4s\n"

        "mov x0, %[rowmajor_accumulator_buffer]\n"
        "st1 {v10.16b}, [x0], #16\n"
        "st1 {v11.16b}, [x0], #16\n"
        "st1 {v12.16b}, [x0], #16\n"
        "st1 {v13.16b}, [x0], #16\n"
        "st1 {v14.16b}, [x0], #16\n"
        : // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        : // inputs
        [rowmajor_accumulator_buffer] "r"(rowmajor_accumulator_buffer)
        : // clobbers
        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28", "v29", "v30", "v31");

    // accumulate row-major accumulators into global (column-major) accumulators
    for (int l = 0; l < kLhsWidth; l++) {
      for (int r = 0; r < kRhsWidth; r++) {
        accum_ptr[l + kLhsWidth * r] +=
            rowmajor_accumulator_buffer[r + l * kRhsWidth];
      }
    }
  }
};
// Fast kernel operating on int8 operands.
// It is assumed that one of the two int8 operands only takes values
// in [-127, 127], while the other may freely range in [-128, 127].
// The issue with both operands taking the value -128 is that:
// -128*-128 + -128*-128 == 32768 overflows int16.
// Every other expression a*b + c*d, for any int8 a,b,c,d, fits in int16
// range. That is the basic idea of this kernel.
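// Spelling out the bounds behind that claim (int16 holds [-32768, 32767]):
//   127 * 127   + 127 * 127    ==  32258  (fits)
//   -128 * 127  + -128 * 127   == -32512  (fits)
//   -128 * -128 + -128 * 127   ==    128  (fits)
//   -128 * -128 + -128 * -128  ==  32768  (does not fit)
// Only the last case, which is excluded by restricting one operand to
// [-127, 127], can overflow the int16 temporary accumulators.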
struct NEON_64bit_GEMM_Int8Operands_AccumTwoWithin16Bits {
  typedef std::int8_t OperandType;
  typedef std::int32_t AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
      KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    std::size_t start_depth = 123;
    std::size_t run_depth = depth;
    std::size_t dst_col_stride = 4;
    AccumulatorType* dst_ptr = accum_ptr;
    asm volatile(
        // Overview of register layout:
        //
        // A 4x16 block of Rhs is stored in 8 bit in v0--v3.
        // A 4x16 block of Lhs is stored in 8 bit in v4--v7.
        //
        // A 4x4 block of accumulators is stored in v16-v31 (as 4x32 bit
        // components which need to be horizontally-added at the end).
        //
        // The Lhs vectors are multiplied by the Rhs vectors with a widening
        // multiply over the 8 first levels of depth, producing int16x8
        // vectors of products for each position in the accumulator matrix.
        // Here comes the special trick: since the operands are signed int8,
        // their range being [ -2^7 , 2^7 ), their products are in range
        // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values
        // without any risk of overflowing int16.
        // We thus proceed with the 8 next levels of depth, multiplying
        // again Lhs by Rhs, accumulating into this existing int16x8 vector.
        //
        // Only then, having processed 16 levels of depth, do we need to
        // horizontally add these int16x8 accumulators into the final
        // int32x4 accumulators.
        //
        // As we do not have enough registers to store all 16 int16x8
        // temporary-16bit-accumulators, we have them cycle through v8--v15.
        //
        // Register layout (ignoring the v8--v15 temporary 16bit accumulators):
        //
        //                               +--------+--------+--------+--------+
        //                               |v0.b[0] |v1.b[0] |v2.b[0] |v3.b[0] |
        //                          Rhs  +--------+--------+--------+--------+
        //                               |  ...   |  ...   |  ...   |  ...   |
        //                               +--------+--------+--------+--------+
        //                               |v0.b[15]|v1.b[15]|v2.b[15]|v3.b[15]|
        //                               +--------+--------+--------+--------+
        //
        //                               |        |        |        |        |
        //
        //    Lhs                        |        |        |        |        |
        //
        //  +-------+-----+--------+ - - +--------+--------+--------+--------+
        //  |v4.b[0]| ... |v4.b[15]|     | v16.4s | v17.4s | v18.4s | v19.4s |
        //  |v5.b[0]| ... |v5.b[15]|     | v20.4s | v21.4s | v22.4s | v23.4s |
        //  |v6.b[0]| ... |v6.b[15]|     | v24.4s | v25.4s | v26.4s | v27.4s |
        //  |v7.b[0]| ... |v7.b[15]|     | v28.4s | v29.4s | v30.4s | v31.4s |
        //  +-------+--------------+ - - +--------+--------+--------+--------+
        //
        //                                        Accumulator
        //

        // Clear accumulators
        "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
        "dup v16.4s, wzr\n"
        "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"
        "dup v17.4s, wzr\n"
        "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
        "dup v18.4s, wzr\n"
        "ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
        "dup v19.4s, wzr\n"
        "ld1 {v6.16b}, [%[lhs_ptr]], #16\n"
        "dup v20.4s, wzr\n"
        "ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
        "dup v21.4s, wzr\n"
        "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
        "dup v22.4s, wzr\n"
        "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
        "dup v23.4s, wzr\n"
        "subs %[run_depth], %[run_depth], #16\n"
        "dup v24.4s, wzr\n"
        "mov x0, %[dst_ptr]\n"
        "dup v25.4s, wzr\n"
        "dup v26.4s, wzr\n"
        "dup v27.4s, wzr\n"
        "dup v28.4s, wzr\n"
        "dup v29.4s, wzr\n"
        "dup v30.4s, wzr\n"
        "dup v31.4s, wzr\n"

        "smull v12.8h, v0.8b, v4.8b\n"
        "smull v13.8h, v1.8b, v4.8b\n"
        "smull v14.8h, v0.8b, v5.8b\n"
        "smull v15.8h, v1.8b, v5.8b\n"
        "smlal2 v12.8h, v0.16b, v4.16b\n"
        "smlal2 v13.8h, v1.16b, v4.16b\n"
        "smlal2 v14.8h, v0.16b, v5.16b\n"
        "smlal2 v15.8h, v1.16b, v5.16b\n"

        "beq " GEMMLOWP_LABEL_AFTER_LOOP "f\n"

        GEMMLOWP_LABEL_LOOP
        ":\n"

        "subs %[run_depth], %[run_depth], #16\n"

        "sadalp v16.4s, v12.8h\n"
        "smull v12.8h, v0.8b, v6.8b\n"
        "sadalp v17.4s, v13.8h\n"
        "smull v13.8h, v0.8b, v7.8b\n"
        "sadalp v20.4s, v14.8h\n"
        "smull v14.8h, v1.8b, v6.8b\n"
        "sadalp v21.4s, v15.8h\n"
        "smull v15.8h, v1.8b, v7.8b\n"
        "smlal2 v12.8h, v0.16b, v6.16b\n"
        "smlal2 v13.8h, v0.16b, v7.16b\n"
        "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"
        "smlal2 v14.8h, v1.16b, v6.16b\n"
        "smlal2 v15.8h, v1.16b, v7.16b\n"
        "ld1 {v1.16b}, [%[rhs_ptr]], #16\n"
        "sadalp v24.4s, v12.8h\n"
        "smull v12.8h, v2.8b, v4.8b\n"
        "sadalp v28.4s, v13.8h\n"
        "smull v13.8h, v3.8b, v4.8b\n"
        "sadalp v25.4s, v14.8h\n"
        "smull v14.8h, v2.8b, v5.8b\n"
        "sadalp v29.4s, v15.8h\n"
        "smull v15.8h, v3.8b, v5.8b\n"
        "smlal2 v12.8h, v2.16b, v4.16b\n"
        "smlal2 v13.8h, v3.16b, v4.16b\n"
        "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
        "smlal2 v14.8h, v2.16b, v5.16b\n"
        "smlal2 v15.8h, v3.16b, v5.16b\n"
        "ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
        "sadalp v18.4s, v12.8h\n"
        "smull v12.8h, v2.8b, v6.8b\n"
        "sadalp v19.4s, v13.8h\n"
        "smull v13.8h, v2.8b, v7.8b\n"
        "sadalp v22.4s, v14.8h\n"
        "smull v14.8h, v3.8b, v6.8b\n"
        "sadalp v23.4s, v15.8h\n"
        "smull v15.8h, v3.8b, v7.8b\n"
        "smlal2 v12.8h, v2.16b, v6.16b\n"
        "smlal2 v13.8h, v2.16b, v7.16b\n"
        "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
        "smlal2 v14.8h, v3.16b, v6.16b\n"
        "ld1 {v6.16b}, [%[lhs_ptr]], #16\n"
        "smlal2 v15.8h, v3.16b, v7.16b\n"
        "ld1 {v7.16b}, [%[lhs_ptr]], #16\n"
        "sadalp v26.4s, v12.8h\n"
        "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
        "smull v12.8h, v0.8b, v4.8b\n"
        "sadalp v30.4s, v13.8h\n"
        "smull v13.8h, v1.8b, v4.8b\n"
        "sadalp v27.4s, v14.8h\n"
        "smull v14.8h, v0.8b, v5.8b\n"
        "sadalp v31.4s, v15.8h\n"
        "smull v15.8h, v1.8b, v5.8b\n"
        "smlal2 v12.8h, v0.16b, v4.16b\n"
        "smlal2 v13.8h, v1.16b, v4.16b\n"
        "smlal2 v14.8h, v0.16b, v5.16b\n"
        "smlal2 v15.8h, v1.16b, v5.16b\n"

        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        GEMMLOWP_LABEL_AFTER_LOOP
        ":\n"

        // Load accumulators from memory
        "ld1 {v8.16b}, [x0], #16\n"
        "ld1 {v9.16b}, [x0], #16\n"
        "ld1 {v10.16b}, [x0], #16\n"
        "ld1 {v11.16b}, [x0], #16\n"
        "mov x0, %[dst_ptr]\n"

        // Do the remaining arithmetic for the 16 last levels of depth.
        // All the operands are already loaded.
        "sadalp v16.4s, v12.8h\n"
        "smull v12.8h, v0.8b, v6.8b\n"
        "sadalp v17.4s, v13.8h\n"
        "smull v13.8h, v0.8b, v7.8b\n"
        "sadalp v20.4s, v14.8h\n"
        "smull v14.8h, v1.8b, v6.8b\n"
        "sadalp v21.4s, v15.8h\n"
        "smull v15.8h, v1.8b, v7.8b\n"
        "smlal2 v12.8h, v0.16b, v6.16b\n"
        "smlal2 v13.8h, v0.16b, v7.16b\n"
        "smlal2 v14.8h, v1.16b, v6.16b\n"
        "smlal2 v15.8h, v1.16b, v7.16b\n"
        "sadalp v24.4s, v12.8h\n"
        "smull v12.8h, v2.8b, v4.8b\n"
        "sadalp v28.4s, v13.8h\n"
        "smull v13.8h, v3.8b, v4.8b\n"
        "sadalp v25.4s, v14.8h\n"
        "smull v14.8h, v2.8b, v5.8b\n"
        "sadalp v29.4s, v15.8h\n"
        "smull v15.8h, v3.8b, v5.8b\n"
        "smlal2 v12.8h, v2.16b, v4.16b\n"
        "smlal2 v13.8h, v3.16b, v4.16b\n"
        "smlal2 v14.8h, v2.16b, v5.16b\n"
        "smlal2 v15.8h, v3.16b, v5.16b\n"
        "sadalp v18.4s, v12.8h\n"
        "smull v12.8h, v2.8b, v6.8b\n"
        "sadalp v19.4s, v13.8h\n"
        "smull v13.8h, v2.8b, v7.8b\n"
        "sadalp v22.4s, v14.8h\n"
        "smull v14.8h, v3.8b, v6.8b\n"
        "sadalp v23.4s, v15.8h\n"
        "smull v15.8h, v3.8b, v7.8b\n"
        "smlal2 v12.8h, v2.16b, v6.16b\n"
        "smlal2 v13.8h, v2.16b, v7.16b\n"
        "smlal2 v14.8h, v3.16b, v6.16b\n"
        "smlal2 v15.8h, v3.16b, v7.16b\n"
        "sadalp v26.4s, v12.8h\n"
        "sadalp v30.4s, v13.8h\n"
        "sadalp v27.4s, v14.8h\n"
        "sadalp v31.4s, v15.8h\n"

        // Reduce aggregators horizontally
        "addp v0.4s, v16.4s, v20.4s\n"
        "addp v1.4s, v17.4s, v21.4s\n"
        "addp v2.4s, v18.4s, v22.4s\n"
        "addp v3.4s, v19.4s, v23.4s\n"
        "addp v4.4s, v24.4s, v28.4s\n"
        "addp v5.4s, v25.4s, v29.4s\n"
        "addp v6.4s, v26.4s, v30.4s\n"
        "addp v7.4s, v27.4s, v31.4s\n"

        "addp v12.4s, v0.4s, v4.4s\n"
        "addp v13.4s, v1.4s, v5.4s\n"
        "addp v14.4s, v2.4s, v6.4s\n"
        "addp v15.4s, v3.4s, v7.4s\n"

        // Add to the accumulators loaded from memory
        "add v8.4s, v8.4s, v12.4s\n"
        "add v9.4s, v9.4s, v13.4s\n"
        "add v10.4s, v10.4s, v14.4s\n"
        "add v11.4s, v11.4s, v15.4s\n"

        // Store accumulators back to memory
        "st1 {v8.16b}, [x0], #16\n"
        "st1 {v9.16b}, [x0], #16\n"
        "st1 {v10.16b}, [x0], #16\n"
        "st1 {v11.16b}, [x0], #16\n"
        : // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [dst_ptr] "+r"(dst_ptr), [run_depth] "+r"(run_depth),
        [dst_col_stride] "+r"(dst_col_stride)
        : // inputs
        [start_depth] "r"(start_depth)
        : // clobbers
        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28", "v29", "v30", "v31");
  }
};
// We don't actually use int32*int32 in production. This is just an
// experiment to help dissociate the effect of integer-vs-float from the
// effect of operand width.
struct NEON_64bit_GEMM_Int32_WithScalar {
  typedef std::int32_t OperandType;
  typedef std::int32_t AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov x0, %[accum_ptr]\n"
        "ld1 {v8.16b}, [x0], #16\n"
        "ld1 {v16.16b}, [x0], #16\n"
        "ld1 {v24.16b}, [x0], #16\n"
        "ld1 {v9.16b}, [x0], #16\n"
        "ld1 {v17.16b}, [x0], #16\n"
        "ld1 {v25.16b}, [x0], #16\n"
        "ld1 {v10.16b}, [x0], #16\n"
        "ld1 {v18.16b}, [x0], #16\n"
        "ld1 {v26.16b}, [x0], #16\n"
        "ld1 {v11.16b}, [x0], #16\n"
        "ld1 {v19.16b}, [x0], #16\n"
        "ld1 {v27.16b}, [x0], #16\n"
        "ld1 {v12.16b}, [x0], #16\n"
        "ld1 {v20.16b}, [x0], #16\n"
        "ld1 {v28.16b}, [x0], #16\n"
        "ld1 {v13.16b}, [x0], #16\n"
        "ld1 {v21.16b}, [x0], #16\n"
        "ld1 {v29.16b}, [x0], #16\n"
        "ld1 {v14.16b}, [x0], #16\n"
        "ld1 {v22.16b}, [x0], #16\n"
        "ld1 {v30.16b}, [x0], #16\n"
        "ld1 {v15.16b}, [x0], #16\n"
        "ld1 {v23.16b}, [x0], #16\n"
        "ld1 {v31.16b}, [x0], #16\n"

        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Load 2 Rhs cells of size 1x4 each
        "ld1 {v0.4s}, [%[rhs_ptr]], #16\n"
        "ld1 {v1.4s}, [%[rhs_ptr]], #16\n"

        // Load 3 Lhs cells of size 4x1 each
        "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"
        "ld1 {v3.4s}, [%[lhs_ptr]], #16\n"
        "ld1 {v4.4s}, [%[lhs_ptr]], #16\n"

        // Multiply-accumulate
        "mla v8.4s, v2.4s, v0.s[0]\n"
        "mla v9.4s, v2.4s, v0.s[1]\n"
        "mla v10.4s, v2.4s, v0.s[2]\n"
        "mla v11.4s, v2.4s, v0.s[3]\n"
        "mla v12.4s, v2.4s, v1.s[0]\n"
        "mla v13.4s, v2.4s, v1.s[1]\n"
        "mla v14.4s, v2.4s, v1.s[2]\n"
        "mla v15.4s, v2.4s, v1.s[3]\n"
        "mla v16.4s, v3.4s, v0.s[0]\n"
        "mla v17.4s, v3.4s, v0.s[1]\n"
        "mla v18.4s, v3.4s, v0.s[2]\n"
        "mla v19.4s, v3.4s, v0.s[3]\n"
        "mla v20.4s, v3.4s, v1.s[0]\n"
        "mla v21.4s, v3.4s, v1.s[1]\n"
        "mla v22.4s, v3.4s, v1.s[2]\n"
        "mla v23.4s, v3.4s, v1.s[3]\n"
        "mla v24.4s, v4.4s, v0.s[0]\n"
        "mla v25.4s, v4.4s, v0.s[1]\n"
        "mla v26.4s, v4.4s, v0.s[2]\n"
        "mla v27.4s, v4.4s, v0.s[3]\n"
        "mla v28.4s, v4.4s, v1.s[0]\n"
        "mla v29.4s, v4.4s, v1.s[1]\n"
        "mla v30.4s, v4.4s, v1.s[2]\n"
        "mla v31.4s, v4.4s, v1.s[3]\n"

        // Loop. Decrement loop index (depth) by 1, since we just handled 1
        // level of depth.
        "subs %w[depth], %w[depth], #1\n"
        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Store accumulators
        "mov x0, %[accum_ptr]\n"
        "st1 {v8.16b}, [x0], #16\n"
        "st1 {v16.16b}, [x0], #16\n"
        "st1 {v24.16b}, [x0], #16\n"
        "st1 {v9.16b}, [x0], #16\n"
        "st1 {v17.16b}, [x0], #16\n"
        "st1 {v25.16b}, [x0], #16\n"
        "st1 {v10.16b}, [x0], #16\n"
        "st1 {v18.16b}, [x0], #16\n"
        "st1 {v26.16b}, [x0], #16\n"
        "st1 {v11.16b}, [x0], #16\n"
        "st1 {v19.16b}, [x0], #16\n"
        "st1 {v27.16b}, [x0], #16\n"
        "st1 {v12.16b}, [x0], #16\n"
        "st1 {v20.16b}, [x0], #16\n"
        "st1 {v28.16b}, [x0], #16\n"
        "st1 {v13.16b}, [x0], #16\n"
        "st1 {v21.16b}, [x0], #16\n"
        "st1 {v29.16b}, [x0], #16\n"
        "st1 {v14.16b}, [x0], #16\n"
        "st1 {v22.16b}, [x0], #16\n"
        "st1 {v30.16b}, [x0], #16\n"
        "st1 {v15.16b}, [x0], #16\n"
        "st1 {v23.16b}, [x0], #16\n"
        "st1 {v31.16b}, [x0], #16\n"
        : // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        : // inputs
        [accum_ptr] "r"(accum_ptr)
        : // clobbers
        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28", "v29", "v30", "v31");
  }
};
// Not a very efficient kernel; just an experiment to see what we can do
// without using NEON multiply-with-scalar instructions.
struct NEON_64bit_GEMM_Float32_WithVectorDuplicatingScalar {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov x0, %[accum_ptr]\n"
        "ld1 {v8.16b}, [x0], #16\n"
        "ld1 {v16.16b}, [x0], #16\n"
        "ld1 {v24.16b}, [x0], #16\n"
        "ld1 {v9.16b}, [x0], #16\n"
        "ld1 {v17.16b}, [x0], #16\n"
        "ld1 {v25.16b}, [x0], #16\n"
        "ld1 {v10.16b}, [x0], #16\n"
        "ld1 {v18.16b}, [x0], #16\n"
        "ld1 {v26.16b}, [x0], #16\n"
        "ld1 {v11.16b}, [x0], #16\n"
        "ld1 {v19.16b}, [x0], #16\n"
        "ld1 {v27.16b}, [x0], #16\n"
        "ld1 {v12.16b}, [x0], #16\n"
        "ld1 {v20.16b}, [x0], #16\n"
        "ld1 {v28.16b}, [x0], #16\n"
        "ld1 {v13.16b}, [x0], #16\n"
        "ld1 {v21.16b}, [x0], #16\n"
        "ld1 {v29.16b}, [x0], #16\n"
        "ld1 {v14.16b}, [x0], #16\n"
        "ld1 {v22.16b}, [x0], #16\n"
        "ld1 {v30.16b}, [x0], #16\n"
        "ld1 {v15.16b}, [x0], #16\n"
        "ld1 {v23.16b}, [x0], #16\n"
        "ld1 {v31.16b}, [x0], #16\n"

        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Load 2 Rhs cells of size 1x4 each
        "ld1 {v5.4s}, [%[rhs_ptr]], #16\n"
        "ld1 {v6.4s}, [%[rhs_ptr]], #16\n"

        // Load 3 Lhs cells of size 4x1 each
        "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"
        "ld1 {v3.4s}, [%[lhs_ptr]], #16\n"
        "ld1 {v4.4s}, [%[lhs_ptr]], #16\n"

        // Multiply-accumulate
        "dup v0.4s, v5.s[0]\n"
        "dup v1.4s, v5.s[1]\n"
        "fmla v8.4s, v2.4s, v0.4s\n"
        "fmla v16.4s, v3.4s, v0.4s\n"
        "fmla v24.4s, v4.4s, v0.4s\n"
        "fmla v9.4s, v2.4s, v1.4s\n"
        "fmla v17.4s, v3.4s, v1.4s\n"
        "fmla v25.4s, v4.4s, v1.4s\n"
        "dup v0.4s, v5.s[2]\n"
        "dup v1.4s, v5.s[3]\n"
        "fmla v10.4s, v2.4s, v0.4s\n"
        "fmla v18.4s, v3.4s, v0.4s\n"
        "fmla v26.4s, v4.4s, v0.4s\n"
        "fmla v11.4s, v2.4s, v1.4s\n"
        "fmla v19.4s, v3.4s, v1.4s\n"
        "fmla v27.4s, v4.4s, v1.4s\n"
        "dup v0.4s, v6.s[0]\n"
        "dup v1.4s, v6.s[1]\n"
        "fmla v12.4s, v2.4s, v0.4s\n"
        "fmla v20.4s, v3.4s, v0.4s\n"
        "fmla v28.4s, v4.4s, v0.4s\n"
        "fmla v13.4s, v2.4s, v1.4s\n"
        "fmla v21.4s, v3.4s, v1.4s\n"
        "fmla v29.4s, v4.4s, v1.4s\n"
        "dup v0.4s, v6.s[2]\n"
        "dup v1.4s, v6.s[3]\n"
        "fmla v14.4s, v2.4s, v0.4s\n"
        "fmla v22.4s, v3.4s, v0.4s\n"
        "fmla v30.4s, v4.4s, v0.4s\n"
        "fmla v15.4s, v2.4s, v1.4s\n"
        "fmla v23.4s, v3.4s, v1.4s\n"
        "fmla v31.4s, v4.4s, v1.4s\n"

        // Loop. Decrement loop index (depth) by 1, since we just handled 1
        // level of depth.
        "subs %w[depth], %w[depth], #1\n"
        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Store accumulators
        "mov x0, %[accum_ptr]\n"
        "st1 {v8.16b}, [x0], #16\n"
        "st1 {v16.16b}, [x0], #16\n"
        "st1 {v24.16b}, [x0], #16\n"
        "st1 {v9.16b}, [x0], #16\n"
        "st1 {v17.16b}, [x0], #16\n"
        "st1 {v25.16b}, [x0], #16\n"
        "st1 {v10.16b}, [x0], #16\n"
        "st1 {v18.16b}, [x0], #16\n"
        "st1 {v26.16b}, [x0], #16\n"
        "st1 {v11.16b}, [x0], #16\n"
        "st1 {v19.16b}, [x0], #16\n"
        "st1 {v27.16b}, [x0], #16\n"
        "st1 {v12.16b}, [x0], #16\n"
        "st1 {v20.16b}, [x0], #16\n"
        "st1 {v28.16b}, [x0], #16\n"
        "st1 {v13.16b}, [x0], #16\n"
        "st1 {v21.16b}, [x0], #16\n"
        "st1 {v29.16b}, [x0], #16\n"
        "st1 {v14.16b}, [x0], #16\n"
        "st1 {v22.16b}, [x0], #16\n"
        "st1 {v30.16b}, [x0], #16\n"
        "st1 {v15.16b}, [x0], #16\n"
        "st1 {v23.16b}, [x0], #16\n"
        "st1 {v31.16b}, [x0], #16\n"
        : // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        : // inputs
        [accum_ptr] "r"(accum_ptr)
        : // clobbers
        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28", "v29", "v30", "v31");
  }
};
// This is the "most natural" kernel, using NEON multiply-with-scalar
// instructions.
struct NEON_64bit_GEMM_Float32_WithScalar {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov x0, %[accum_ptr]\n"
        "ld1 {v8.16b}, [x0], #16\n"
        "ld1 {v16.16b}, [x0], #16\n"
        "ld1 {v24.16b}, [x0], #16\n"
        "ld1 {v9.16b}, [x0], #16\n"
        "ld1 {v17.16b}, [x0], #16\n"
        "ld1 {v25.16b}, [x0], #16\n"
        "ld1 {v10.16b}, [x0], #16\n"
        "ld1 {v18.16b}, [x0], #16\n"
        "ld1 {v26.16b}, [x0], #16\n"
        "ld1 {v11.16b}, [x0], #16\n"
        "ld1 {v19.16b}, [x0], #16\n"
        "ld1 {v27.16b}, [x0], #16\n"
        "ld1 {v12.16b}, [x0], #16\n"
        "ld1 {v20.16b}, [x0], #16\n"
        "ld1 {v28.16b}, [x0], #16\n"
        "ld1 {v13.16b}, [x0], #16\n"
        "ld1 {v21.16b}, [x0], #16\n"
        "ld1 {v29.16b}, [x0], #16\n"
        "ld1 {v14.16b}, [x0], #16\n"
        "ld1 {v22.16b}, [x0], #16\n"
        "ld1 {v30.16b}, [x0], #16\n"
        "ld1 {v15.16b}, [x0], #16\n"
        "ld1 {v23.16b}, [x0], #16\n"
        "ld1 {v31.16b}, [x0], #16\n"

        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Load 2 Rhs cells of size 1x4 each
        "ld1 {v0.4s}, [%[rhs_ptr]], #16\n"
        "ld1 {v1.4s}, [%[rhs_ptr]], #16\n"

        // Load 3 Lhs cells of size 4x1 each
        "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"
        "ld1 {v3.4s}, [%[lhs_ptr]], #16\n"
        "ld1 {v4.4s}, [%[lhs_ptr]], #16\n"

        // Multiply-accumulate
        "fmla v8.4s, v2.4s, v0.s[0]\n"
        "fmla v9.4s, v2.4s, v0.s[1]\n"
        "fmla v10.4s, v2.4s, v0.s[2]\n"
        "fmla v11.4s, v2.4s, v0.s[3]\n"
        "fmla v12.4s, v2.4s, v1.s[0]\n"
        "fmla v13.4s, v2.4s, v1.s[1]\n"
        "fmla v14.4s, v2.4s, v1.s[2]\n"
        "fmla v15.4s, v2.4s, v1.s[3]\n"
        "fmla v16.4s, v3.4s, v0.s[0]\n"
        "fmla v17.4s, v3.4s, v0.s[1]\n"
        "fmla v18.4s, v3.4s, v0.s[2]\n"
        "fmla v19.4s, v3.4s, v0.s[3]\n"
        "fmla v20.4s, v3.4s, v1.s[0]\n"
        "fmla v21.4s, v3.4s, v1.s[1]\n"
        "fmla v22.4s, v3.4s, v1.s[2]\n"
        "fmla v23.4s, v3.4s, v1.s[3]\n"
        "fmla v24.4s, v4.4s, v0.s[0]\n"
        "fmla v25.4s, v4.4s, v0.s[1]\n"
        "fmla v26.4s, v4.4s, v0.s[2]\n"
        "fmla v27.4s, v4.4s, v0.s[3]\n"
        "fmla v28.4s, v4.4s, v1.s[0]\n"
        "fmla v29.4s, v4.4s, v1.s[1]\n"
        "fmla v30.4s, v4.4s, v1.s[2]\n"
        "fmla v31.4s, v4.4s, v1.s[3]\n"

        // Loop. Decrement loop index (depth) by 1, since we just handled 1
        // level of depth.
        "subs %w[depth], %w[depth], #1\n"
        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Store accumulators
        "mov x0, %[accum_ptr]\n"
        "st1 {v8.16b}, [x0], #16\n"
        "st1 {v16.16b}, [x0], #16\n"
        "st1 {v24.16b}, [x0], #16\n"
        "st1 {v9.16b}, [x0], #16\n"
        "st1 {v17.16b}, [x0], #16\n"
        "st1 {v25.16b}, [x0], #16\n"
        "st1 {v10.16b}, [x0], #16\n"
        "st1 {v18.16b}, [x0], #16\n"
        "st1 {v26.16b}, [x0], #16\n"
        "st1 {v11.16b}, [x0], #16\n"
        "st1 {v19.16b}, [x0], #16\n"
        "st1 {v27.16b}, [x0], #16\n"
        "st1 {v12.16b}, [x0], #16\n"
        "st1 {v20.16b}, [x0], #16\n"
        "st1 {v28.16b}, [x0], #16\n"
        "st1 {v13.16b}, [x0], #16\n"
        "st1 {v21.16b}, [x0], #16\n"
        "st1 {v29.16b}, [x0], #16\n"
        "st1 {v14.16b}, [x0], #16\n"
        "st1 {v22.16b}, [x0], #16\n"
        "st1 {v30.16b}, [x0], #16\n"
        "st1 {v15.16b}, [x0], #16\n"
        "st1 {v23.16b}, [x0], #16\n"
        "st1 {v31.16b}, [x0], #16\n"
        : // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        : // inputs
        [accum_ptr] "r"(accum_ptr)
        : // clobbers
        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28", "v29", "v30", "v31");
  }
};
// Faster kernel contributed by ARM. Tuned for A57.
struct NEON_64bit_GEMM_Float32_WithScalar_A57 {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov x0, %[accum_ptr]\n"
        "ld1 {v8.16b}, [x0], #16\n"
        "ld1 {v16.16b}, [x0], #16\n"
        "ld1 {v24.16b}, [x0], #16\n"
        "ld1 {v9.16b}, [x0], #16\n"
        "ld1 {v17.16b}, [x0], #16\n"
        "ld1 {v25.16b}, [x0], #16\n"
        "ld1 {v10.16b}, [x0], #16\n"
        "ld1 {v18.16b}, [x0], #16\n"
        "ld1 {v26.16b}, [x0], #16\n"
        "ld1 {v11.16b}, [x0], #16\n"
        "ld1 {v19.16b}, [x0], #16\n"
        "ld1 {v27.16b}, [x0], #16\n"
        "ld1 {v12.16b}, [x0], #16\n"
        "ld1 {v20.16b}, [x0], #16\n"
        "ld1 {v28.16b}, [x0], #16\n"
        "ld1 {v13.16b}, [x0], #16\n"
        "ld1 {v21.16b}, [x0], #16\n"
        "ld1 {v29.16b}, [x0], #16\n"
        "ld1 {v14.16b}, [x0], #16\n"
        "ld1 {v22.16b}, [x0], #16\n"
        "ld1 {v30.16b}, [x0], #16\n"
        "ld1 {v15.16b}, [x0], #16\n"
        "ld1 {v23.16b}, [x0], #16\n"
        "ld1 {v31.16b}, [x0], #16\n"

        // The start of the loop assumes the first Rhs cell is already
        // loaded, so do it here for the first iteration.
        "ld1 {v0.4s}, [%[rhs_ptr]], #16\n"

        // And the same for the first Lhs cell.
        "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"

        GEMMLOWP_LABEL_LOOP
        ":\n"

        // Start the MACs at the head of the loop - 1st cell from each side
        // already loaded.
        "fmla v8.4s, v2.4s, v0.s[0]\n"
        "fmla v9.4s, v2.4s, v0.s[1]\n"
        "ld1 {v1.4s}, [%[rhs_ptr]], #16\n"  // Load second Rhs cell.
        "fmla v10.4s, v2.4s, v0.s[2]\n"
        "fmla v11.4s, v2.4s, v0.s[3]\n"
        "ld1 {v3.4s}, [%[lhs_ptr]], #16\n"  // Load second Lhs cell.
        "fmla v12.4s, v2.4s, v1.s[0]\n"
        "fmla v13.4s, v2.4s, v1.s[1]\n"
        "ld1 {v4.4s}, [%[lhs_ptr]], #16\n"  // Load third Lhs cell.
        "fmla v14.4s, v2.4s, v1.s[2]\n"
        "fmla v15.4s, v2.4s, v1.s[3]\n"
        "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"  // Done with first Lhs cell - load
                                            // for the next iteration early.
        "fmla v16.4s, v3.4s, v0.s[0]\n"
        "fmla v17.4s, v3.4s, v0.s[1]\n"
        "fmla v18.4s, v3.4s, v0.s[2]\n"
        "fmla v19.4s, v3.4s, v0.s[3]\n"
        "fmla v20.4s, v3.4s, v1.s[0]\n"
        "fmla v21.4s, v3.4s, v1.s[1]\n"
        "fmla v22.4s, v3.4s, v1.s[2]\n"
        "fmla v23.4s, v3.4s, v1.s[3]\n"
        "fmla v24.4s, v4.4s, v0.s[0]\n"
        "fmla v25.4s, v4.4s, v0.s[1]\n"
        "fmla v26.4s, v4.4s, v0.s[2]\n"
        "fmla v27.4s, v4.4s, v0.s[3]\n"
        "ld1 {v0.4s}, [%[rhs_ptr]], #16\n"  // Done with the first Rhs cell -
                                            // load for the next iteration
                                            // early.
        "fmla v28.4s, v4.4s, v1.s[0]\n"
        "fmla v29.4s, v4.4s, v1.s[1]\n"
        // Loop. Decrement loop index (depth) by 1, since we just handled
        // 1 level of depth. Do this a bit before the end of the loop for
        // better dispatch on A57.
        "subs %w[depth], %w[depth], #1\n"
        "fmla v30.4s, v4.4s, v1.s[2]\n"
        "fmla v31.4s, v4.4s, v1.s[3]\n"

        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Store accumulators
        "mov x0, %[accum_ptr]\n"
        "st1 {v8.16b}, [x0], #16\n"
        "st1 {v16.16b}, [x0], #16\n"
        "st1 {v24.16b}, [x0], #16\n"
        "st1 {v9.16b}, [x0], #16\n"
        "st1 {v17.16b}, [x0], #16\n"
        "st1 {v25.16b}, [x0], #16\n"
        "st1 {v10.16b}, [x0], #16\n"
        "st1 {v18.16b}, [x0], #16\n"
        "st1 {v26.16b}, [x0], #16\n"
        "st1 {v11.16b}, [x0], #16\n"
        "st1 {v19.16b}, [x0], #16\n"
        "st1 {v27.16b}, [x0], #16\n"
        "st1 {v12.16b}, [x0], #16\n"
        "st1 {v20.16b}, [x0], #16\n"
        "st1 {v28.16b}, [x0], #16\n"
        "st1 {v13.16b}, [x0], #16\n"
        "st1 {v21.16b}, [x0], #16\n"
        "st1 {v29.16b}, [x0], #16\n"
        "st1 {v14.16b}, [x0], #16\n"
        "st1 {v22.16b}, [x0], #16\n"
        "st1 {v30.16b}, [x0], #16\n"
        "st1 {v15.16b}, [x0], #16\n"
        "st1 {v23.16b}, [x0], #16\n"
        "st1 {v31.16b}, [x0], #16\n"
        : // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        : // inputs
        [accum_ptr] "r"(accum_ptr)
        : // clobbers
        "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
        "v28", "v29", "v30", "v31");
  }
};
#ifndef __APPLE__
// Faster kernel contributed by ARM. Tuned for A53.
struct NEON_64bit_GEMM_Float32_WithScalar_A53 {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 2> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    asm volatile(
        // Load accumulators
        "mov x0, %[accum_ptr]\n"
        "ld1 {v8.16b}, [x0], #16\n"
        "ld1 {v16.16b}, [x0], #16\n"
        "ld1 {v24.16b}, [x0], #16\n"
        "ld1 {v9.16b}, [x0], #16\n"
        "ld1 {v17.16b}, [x0], #16\n"
        "ld1 {v25.16b}, [x0], #16\n"
        "ld1 {v10.16b}, [x0], #16\n"
        "ld1 {v18.16b}, [x0], #16\n"
        "ld1 {v26.16b}, [x0], #16\n"
        "ld1 {v11.16b}, [x0], #16\n"
        "ld1 {v19.16b}, [x0], #16\n"
        "ld1 {v27.16b}, [x0], #16\n"
        "ld1 {v12.16b}, [x0], #16\n"
        "ld1 {v20.16b}, [x0], #16\n"
        "ld1 {v28.16b}, [x0], #16\n"
        "ld1 {v13.16b}, [x0], #16\n"
        "ld1 {v21.16b}, [x0], #16\n"
        "ld1 {v29.16b}, [x0], #16\n"
        "ld1 {v14.16b}, [x0], #16\n"
        "ld1 {v22.16b}, [x0], #16\n"
        "ld1 {v30.16b}, [x0], #16\n"
        "ld1 {v15.16b}, [x0], #16\n"
        "ld1 {v23.16b}, [x0], #16\n"
        "ld1 {v31.16b}, [x0], #16\n"

        // For A53, a very different-looking loop is needed.
        //
        // The main reason for this is that on A53 128-bit loads take two
        // cycles during which no dual issue can occur. Doing two separate
        // 64-bit loads avoids this issue - they each take one cycle and are
        // able to dual issue. Since vector register loads don't dual issue
        // with FMLA, we load half the register as normal and the other half
        // into an integer register. This second half can then be moved into
        // place later with an INS instruction - which will dual issue with a
        // later FP load.
        //
        // For this kernel there are approximately 3 times as many multiplies
        // as loads, so it makes sense to structure the loop into blocks of 4
        // cycles, with 1 dedicated "load cycle" and 3 "multiply cycles" per
        // block. Strictly preserving this structure with NOPs where no load
        // is needed seems to result in higher performance.
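        //
        // As a summary of the per-block comments below, each 4-cycle block
        // is laid out roughly as:
        //   cycle 0 ("load"):   ldr d<next>  paired with  ins v<prev>.d[1], x18
        //   cycle 1 ("fmla 1"): fmla ...     paired with  ldr x18, [ptr, #8]
        //   cycle 2 ("fmla 2"): fmla ...     paired with  add ptr, ptr, #16
        //   cycle 3 ("fmla 3"): fmla ...     (no companion work)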
        //
        // Choice of x18 to store the upper halves on their way into the
        // vector registers is arbitrary. Added to the clobber list so that
        // the compiler will make it available.
        //
        // At the start of the loop, it is assumed that v0 is "half loaded" -
        // bottom half in place in d0 and the upper half in x18 ready to
        // insert. So set that up here for the first iteration:
        "ldr d0, [%[rhs_ptr]]\n"       // Bottom half of first Rhs cell
        "ldr x18, [%[rhs_ptr], #8]\n"  // Upper half
        "add %[rhs_ptr], %[rhs_ptr], #16\n"  // Separate increment (needed as
                                             // there is no operation to load
                                             // at reg + 8 but then increment
                                             // reg by 16).

        // v2 should be fully loaded - as it's outside the loop proper it's
        // fine to use a 128-bit load here.
        "ld1 {v2.4s}, [%[lhs_ptr]], #16\n"  // first Lhs cell

        GEMMLOWP_LABEL_LOOP
        ":\n"

        // First block of four cycles. Multiplies all require v2 and v0; v2
        // is loaded earlier and v0 is half loaded and completed in the load
        // cycle at the start.
        "ldr d1, [%[rhs_ptr]]\n"  // "load" cycle - loading bottom half of v1
                                  // (second Rhs cell).
        "ins v0.d[1], x18\n"      // "load" cycle - moving the upper half of
                                  // v0 into place.
        "fmla v8.4s, v2.4s, v0.s[0]\n"  // "fmla" cycle 1 - first multiply.
        "ldr x18, [%[rhs_ptr], #8]\n"   // "fmla" cycle 1 - load upper half of
                                        // v1 into x18.
        "fmla v9.4s, v2.4s, v0.s[1]\n"  // "fmla" cycle 2 - second multiply
        "add %[rhs_ptr], %[rhs_ptr], #16\n"  // "fmla" cycle 2 - increment Rhs
                                             // pointer (if needed)
        "fmla v10.4s, v2.4s, v0.s[2]\n"  // "fmla" cycle 3 - third multiply.
                                         // No more work to dual issue.

        // Second block. Start loading v3 (second Lhs cell), finish loading
        // v1.
        "ldr d3, [%[lhs_ptr]]\n"
        "ins v1.d[1], x18\n"  // v1 ready here.
        "fmla v11.4s, v2.4s, v0.s[3]\n"
        "ldr x18, [%[lhs_ptr], #8]\n"
        "fmla v12.4s, v2.4s, v1.s[0]\n"  // First use of v1.
        "add %[lhs_ptr], %[lhs_ptr], #16\n"
        "fmla v13.4s, v2.4s, v1.s[1]\n"

        // Third block. Start loading v4 (third Lhs cell), finish loading v3.
        "ldr d4, [%[lhs_ptr]]\n"
        "ins v3.d[1], x18\n"  // v3 ready here.
        "fmla v14.4s, v2.4s, v1.s[2]\n"
        "ldr x18, [%[lhs_ptr], #8]\n"
        "fmla v15.4s, v2.4s, v1.s[3]\n"
        "add %[lhs_ptr], %[lhs_ptr], #16\n"
        "fmla v16.4s, v3.4s, v0.s[0]\n"  // First use of v3.

        // Fourth block. v2 (first Lhs cell) is now finished with, so start
        // loading value for next iteration. Finish loading v4.
        "ldr d2, [%[lhs_ptr]]\n"
        "ins v4.d[1], x18\n"  // v4 ready here.
        "fmla v17.4s, v3.4s, v0.s[1]\n"
        "ldr x18, [%[lhs_ptr], #8]\n"
        "fmla v18.4s, v3.4s, v0.s[2]\n"
        "add %[lhs_ptr], %[lhs_ptr], #16\n"
        "fmla v19.4s, v3.4s, v0.s[3]\n"

        // Fifth block, finish loading v2. No new load to start as the other
        // registers are all still live.
        "ins v2.d[1], x18\n"
        "fmla v20.4s, v3.4s, v1.s[0]\n"
        "fmla v21.4s, v3.4s, v1.s[1]\n"
        "fmla v22.4s, v3.4s, v1.s[2]\n"

        // Sixth block, nothing to load. 2 nops needed as a single nop would
        // dual issue with the FMLA and break the timing.
        "nop\n"
        "nop\n"
        "fmla v23.4s, v3.4s, v1.s[3]\n"
        "fmla v24.4s, v4.4s, v0.s[0]\n"  // First use of v4.
        "fmla v25.4s, v4.4s, v0.s[1]\n"

        // Seventh block, nothing to load. Decrement the loop counter in this
        // block as the last block is very full.
        "nop\n"
        "nop\n"
        "fmla v26.4s, v4.4s, v0.s[2]\n"
        "subs %w[depth], %w[depth], #1\n"
        "fmla v27.4s, v4.4s, v0.s[3]\n"
        "fmla v28.4s, v4.4s, v1.s[0]\n"

        // Eighth block - start loading v0 for next iteration.
        "ldr d0, [%[rhs_ptr]]\n"
        "fmla v29.4s, v4.4s, v1.s[1]\n"
        "ldr x18, [%[rhs_ptr], #8]\n"
        "fmla v30.4s, v4.4s, v1.s[2]\n"
        "add %[rhs_ptr], %[rhs_ptr], #16\n"
        "fmla v31.4s, v4.4s, v1.s[3]\n"

        // Loop branch. This will dual issue in fmla cycle 3 of the 8th
        // block.
        "bne " GEMMLOWP_LABEL_LOOP "b\n"

        // Store accumulators
        "mov x0, %[accum_ptr]\n"
        "st1 {v8.16b}, [x0], #16\n"
        "st1 {v16.16b}, [x0], #16\n"
        "st1 {v24.16b}, [x0], #16\n"
        "st1 {v9.16b}, [x0], #16\n"
        "st1 {v17.16b}, [x0], #16\n"
        "st1 {v25.16b}, [x0], #16\n"
        "st1 {v10.16b}, [x0], #16\n"
        "st1 {v18.16b}, [x0], #16\n"
        "st1 {v26.16b}, [x0], #16\n"
        "st1 {v11.16b}, [x0], #16\n"
        "st1 {v19.16b}, [x0], #16\n"
        "st1 {v27.16b}, [x0], #16\n"
        "st1 {v12.16b}, [x0], #16\n"
        "st1 {v20.16b}, [x0], #16\n"
        "st1 {v28.16b}, [x0], #16\n"
        "st1 {v13.16b}, [x0], #16\n"
        "st1 {v21.16b}, [x0], #16\n"
        "st1 {v29.16b}, [x0], #16\n"
        "st1 {v14.16b}, [x0], #16\n"
        "st1 {v22.16b}, [x0], #16\n"
        "st1 {v30.16b}, [x0], #16\n"
        "st1 {v15.16b}, [x0], #16\n"
        "st1 {v23.16b}, [x0], #16\n"
        "st1 {v31.16b}, [x0], #16\n"
        : // outputs
        [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
        [depth] "+r"(depth)
        : // inputs
        [accum_ptr] "r"(accum_ptr)
        : // clobbers
        "cc", "memory", "x0", "x18", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
        "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16",
        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
        "v27", "v28", "v29", "v30", "v31");
  }
};
#endif
#endif  // __aarch64__

#ifndef __aarch64__
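// vpaddq_s32 is provided as an intrinsic on AArch64 only; on 32-bit ARM we
// emulate it with two 64-bit pairwise additions.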
inline int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
  const int32x2_t c = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
  const int32x2_t d = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
  return vcombine_s32(c, d);
}
#endif
// C++ intrinsics-based variant of the deep, int8, fast kernel
template <int Cols>
struct NEON_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics {
|
|
typedef std::int8_t OperandType;
|
|
typedef std::int32_t AccumulatorType;
|
|
typedef KernelFormat<
|
|
KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>,
|
|
KernelSideFormat<CellFormat<Cols, 16, CellOrder::WidthMajor>, 1> >
|
|
Format;
|
|
static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
|
|
AccumulatorType* accum_ptr, int depth) {
|
|
int32x4_t acc[4][Cols];
|
|
for (int i = 0; i < 4; i++) {
|
|
for (int j = 0; j < Cols; j++) {
|
|
acc[i][j] = vdupq_n_s32(0);
|
|
}
|
|
}
|
|
for (int d = 0; d < depth; d += 16) {
|
|
int8x16_t lhs[4];
|
|
for (int i = 0; i < 4; i++) {
|
|
lhs[i] = vld1q_s8(lhs_ptr + 16 * i);
|
|
}
|
|
int8x16_t rhs[Cols];
|
|
for (int i = 0; i < Cols; i++) {
|
|
rhs[i] = vld1q_s8(rhs_ptr + 16 * i);
|
|
}
|
|
for (int i = 0; i < 4; i++) {
|
|
for (int j = 0; j < Cols; j++) {
|
|
int16x8_t local_acc =
|
|
vmull_s8(vget_low_s8(lhs[i]), vget_low_s8(rhs[j]));
|
|
local_acc =
|
|
vmlal_s8(local_acc, vget_high_s8(lhs[i]), vget_high_s8(rhs[j]));
|
|
acc[i][j] = vpadalq_s16(acc[i][j], local_acc);
|
|
}
|
|
}
|
|
lhs_ptr += 64;
|
|
rhs_ptr += 16 * Cols;
|
|
}
|
|
for (int i = 0; i < Cols; i++) {
|
|
int32x4_t acc_2x_0 = vpaddq_s32(acc[0][i], acc[1][i]);
|
|
int32x4_t acc_2x_1 = vpaddq_s32(acc[2][i], acc[3][i]);
|
|
int32x4_t acc_4x = vpaddq_s32(acc_2x_0, acc_2x_1);
|
|
int32x4_t dst_val = vld1q_s32(accum_ptr + 4 * i);
|
|
dst_val = vaddq_s32(dst_val, acc_4x);
|
|
vst1q_s32(accum_ptr + 4 * i, dst_val);
|
|
}
|
|
}
|
|
};
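
// Illustrative sketch of the "AccumTwoWithin16Bits" trick used above: two
// int8*int8 products each lie in [-16129, 16129], so their sum always fits
// in an int16 ([-32768, 32767]); only that pair-sum is widened into the
// int32 accumulator. vmull_s8 + vmlal_s8 + vpadalq_s16 do this 8 lanes at a
// time; the scalar model below (our name, for exposition) does one lane.
inline std::int32_t accum_two_within_16bits_model(std::int8_t a0,
                                                  std::int8_t b0,
                                                  std::int8_t a1,
                                                  std::int8_t b1,
                                                  std::int32_t acc) {
  // The pair-sum is guaranteed not to overflow int16.
  const std::int16_t pair_sum = static_cast<std::int16_t>(a0 * b0 + a1 * b1);
  return acc + pair_sum;  // widening accumulation into int32
}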

using NEON_64bit_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics =
    NEON_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics<4>;

using NEON_32bit_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics =
    NEON_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics<2>;

// C++ intrinsics-based variant of the wide, uint8, general kernel
template <int RhsCells>
struct NEON_GEMM_Uint8Operands_Uint32Accumulators_intrinsics {
  typedef std::uint8_t OperandType;
  typedef std::int32_t AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 2, CellOrder::DepthMajor>, RhsCells> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    int32x4_t acc[3][4 * RhsCells];
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 4 * RhsCells; j++) {
        acc[i][j] = vld1q_s32(accum_ptr + 4 * (i + 3 * j));
      }
    }
    for (int d = 0; d < depth; d += 2) {
      int16x8_t lhs[3];
      for (int i = 0; i < 3; i++) {
        lhs[i] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(lhs_ptr + 8 * i)));
      }
      int16x8_t rhs[RhsCells];
      for (int i = 0; i < RhsCells; i++) {
        rhs[i] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(rhs_ptr + 8 * i)));
      }
      for (int i = 0; i < 3; i++) {
        for (int j = 0; j < RhsCells; j++) {
          acc[i][4 * j + 0] = vmlal_lane_s16(
              acc[i][4 * j + 0], vget_low_s16(lhs[i]), vget_low_s16(rhs[j]), 0);
          acc[i][4 * j + 1] = vmlal_lane_s16(
              acc[i][4 * j + 1], vget_low_s16(lhs[i]), vget_low_s16(rhs[j]), 1);
          acc[i][4 * j + 2] = vmlal_lane_s16(
              acc[i][4 * j + 2], vget_low_s16(lhs[i]), vget_low_s16(rhs[j]), 2);
          acc[i][4 * j + 3] = vmlal_lane_s16(
              acc[i][4 * j + 3], vget_low_s16(lhs[i]), vget_low_s16(rhs[j]), 3);
          acc[i][4 * j + 0] =
              vmlal_lane_s16(acc[i][4 * j + 0], vget_high_s16(lhs[i]),
                             vget_high_s16(rhs[j]), 0);
          acc[i][4 * j + 1] =
              vmlal_lane_s16(acc[i][4 * j + 1], vget_high_s16(lhs[i]),
                             vget_high_s16(rhs[j]), 1);
          acc[i][4 * j + 2] =
              vmlal_lane_s16(acc[i][4 * j + 2], vget_high_s16(lhs[i]),
                             vget_high_s16(rhs[j]), 2);
          acc[i][4 * j + 3] =
              vmlal_lane_s16(acc[i][4 * j + 3], vget_high_s16(lhs[i]),
                             vget_high_s16(rhs[j]), 3);
        }
      }
      lhs_ptr += 24;
      rhs_ptr += 8 * RhsCells;
    }
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 4 * RhsCells; j++) {
        vst1q_s32(accum_ptr + 4 * (i + 3 * j), acc[i][j]);
      }
    }
  }
};
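
// Illustrative sketch: each vmlal_lane_s16 call above performs, per int32
// lane k, acc[k] += int32(lhs_lane[k]) * int32(rhs_lane_scalar). A scalar
// model (our name, for exposition) of one such multiply-accumulate:
inline void vmlal_lane_s16_model(std::int32_t acc[4],
                                 const std::int16_t lhs_lanes[4],
                                 std::int16_t rhs_scalar) {
  for (int k = 0; k < 4; k++) {
    // Widening multiply-accumulate: int16 * int16 -> int32.
    acc[k] += static_cast<std::int32_t>(lhs_lanes[k]) *
              static_cast<std::int32_t>(rhs_scalar);
  }
}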

using NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_intrinsics =
    NEON_GEMM_Uint8Operands_Uint32Accumulators_intrinsics<1>;

using NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_intrinsics =
    NEON_GEMM_Uint8Operands_Uint32Accumulators_intrinsics<2>;

template <int RhsCells>
struct NEON_GEMM_Float32_WithScalar_intrinsics {
  typedef float OperandType;
  typedef float AccumulatorType;
  typedef KernelFormat<
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, 3>,
      KernelSideFormat<CellFormat<4, 1, CellOrder::DepthMajor>, RhsCells> >
      Format;
  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    float32x4_t acc[3][4 * RhsCells];
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 4 * RhsCells; j++) {
        acc[i][j] = vld1q_f32(accum_ptr + 4 * (i + 3 * j));
      }
    }
    for (int d = 0; d < depth; d++) {
      float32x4_t lhs[3];
      for (int i = 0; i < 3; i++) {
        lhs[i] = vld1q_f32(lhs_ptr + 4 * i);
      }
      float32x4_t rhs[RhsCells];
      for (int i = 0; i < RhsCells; i++) {
        rhs[i] = vld1q_f32(rhs_ptr + 4 * i);
      }
      for (int i = 0; i < 3; i++) {
        for (int j = 0; j < RhsCells; j++) {
          acc[i][4 * j + 0] = vmlaq_lane_f32(acc[i][4 * j + 0], lhs[i],
                                             vget_low_f32(rhs[j]), 0);
          acc[i][4 * j + 1] = vmlaq_lane_f32(acc[i][4 * j + 1], lhs[i],
                                             vget_low_f32(rhs[j]), 1);
          acc[i][4 * j + 2] = vmlaq_lane_f32(acc[i][4 * j + 2], lhs[i],
                                             vget_high_f32(rhs[j]), 0);
          acc[i][4 * j + 3] = vmlaq_lane_f32(acc[i][4 * j + 3], lhs[i],
                                             vget_high_f32(rhs[j]), 1);
        }
      }
      lhs_ptr += 12;
      rhs_ptr += 4 * RhsCells;
    }
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 4 * RhsCells; j++) {
        vst1q_f32(accum_ptr + 4 * (i + 3 * j), acc[i][j]);
      }
    }
  }
};

using NEON_32bit_GEMM_Float32_WithScalar_intrinsics =
    NEON_GEMM_Float32_WithScalar_intrinsics<1>;

using NEON_64bit_GEMM_Float32_WithScalar_intrinsics =
    NEON_GEMM_Float32_WithScalar_intrinsics<2>;

// BEGIN code copied from gemmlowp/internal/kernel_reference.h

// This kernel is templatized in an arbitrary Format template parameter,
// allowing it to have any arbitrary format.
template <typename tOperandType, typename tAccumulatorType, typename tFormat>
struct ReferenceKernel {
  typedef tOperandType OperandType;
  typedef tAccumulatorType AccumulatorType;
  typedef tFormat Format;

  static void Run(const OperandType* lhs_ptr, const OperandType* rhs_ptr,
                  AccumulatorType* accum_ptr, int depth) {
    const int depth_cells = static_cast<int>(depth / Format::kDepth);

    // The outer loop is over the depth dimension.
    for (int dc = 0; dc < depth_cells; dc++) {
      // The next two loops are over cells of the Lhs (stacked vertically),
      // and over cells of the Rhs (stacked horizontally).
      for (int rc = 0; rc < Format::Lhs::kCells; rc++) {
        const OperandType* lhs_cell_ptr =
            lhs_ptr + (dc * Format::Lhs::kCells + rc) *
                          Format::Lhs::Cell::kWidth * Format::kDepth;
        for (int cc = 0; cc < Format::Rhs::kCells; cc++) {
          const OperandType* rhs_cell_ptr =
              rhs_ptr + (dc * Format::Rhs::kCells + cc) *
                            Format::Rhs::Cell::kWidth * Format::kDepth;

          // Now we are inside one cell of the Lhs and inside one cell
          // of the Rhs, so the remaining inner loops are just
          // traditional three loops of matrix multiplication.
          for (int di = 0; di < Format::kDepth; di++) {
            for (int ri = 0; ri < Format::Lhs::Cell::kWidth; ri++) {
              for (int ci = 0; ci < Format::Rhs::Cell::kWidth; ci++) {
                const OperandType* lhs_coeff_ptr =
                    lhs_cell_ptr +
                    OffsetIntoCell<typename Format::Lhs::Cell>(ri, di);
                const OperandType* rhs_coeff_ptr =
                    rhs_cell_ptr +
                    OffsetIntoCell<typename Format::Rhs::Cell>(ci, di);
                AccumulatorType* accumulator_coeff_ptr =
                    accum_ptr + (ri + rc * Format::Lhs::Cell::kWidth) +
                    (ci + cc * Format::Rhs::Cell::kWidth) * Format::kRows;
                *accumulator_coeff_ptr += AccumulatorType(*lhs_coeff_ptr) *
                                          AccumulatorType(*rhs_coeff_ptr);
              }
            }
          }
        }
      }
    }
  }
};

// END code copied from gemmlowp/internal/kernel_reference.h
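
// Worked example of the reference kernel's accumulator indexing above
// (illustrative): with 3 Lhs cells of width 4 (kRows = 12) and a single Rhs
// cell of width 4, the accumulator for output row r = ri + 4 * rc and
// column c = ci lives at accum_ptr[r + 12 * c], i.e. column-major storage.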

template <typename DataType>
class CacheLineAlignedBuffer {
 public:
  CacheLineAlignedBuffer(std::size_t size) : size_(size) {
    data_ = nullptr;
    // Adds a few bytes of padding here, because the 64-bit 'A57' kernel
    // reads one iteration past the end of the buffer, causing a crash on
    // iOS.
    posix_memalign(reinterpret_cast<void**>(&data_), kCacheLineSize,
                   size_ * sizeof(DataType) + 16);
  }

  ~CacheLineAlignedBuffer() { free(data_); }

  const DataType* data() const { return data_; }
  DataType* data() { return data_; }

  std::size_t size() const { return size_; }

 private:
  const std::size_t size_;
  DataType* data_;
};
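
// Usage sketch (illustrative): buffers are cache-line aligned and slightly
// over-allocated, so kernels that read one iteration past the end stay safe.
//   CacheLineAlignedBuffer<float> lhs(12 * 192);  // e.g. 12-wide Lhs, depth 192
//   FillRandom(&lhs);  // defined below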

template <typename DataType>
void FillRandom(CacheLineAlignedBuffer<DataType>* buffer) {
  static std::mt19937 generator(0);
  // 100 is smaller than any nonzero bound of the range of any data type.
  const DataType kMaxVal = DataType(100);
  const DataType kMinVal =
      std::is_signed<DataType>::value ? -kMaxVal : DataType(0);
  std::uniform_real_distribution<float> dist(kMinVal, kMaxVal);
  for (std::size_t i = 0; i < buffer->size(); i++) {
    buffer->data()[i] = DataType(dist(generator));
  }
}

template <typename DataType>
void FillZero(CacheLineAlignedBuffer<DataType>* buffer) {
  for (std::size_t i = 0; i < buffer->size(); i++) {
    buffer->data()[i] = DataType(0);
  }
}

template <typename DataType>
void Copy(CacheLineAlignedBuffer<DataType>* dst,
          const CacheLineAlignedBuffer<DataType>& src) {
  assert(dst->size() == src.size());
  memcpy(dst->data(), src.data(), src.size() * sizeof(DataType));
}

template <typename DataType>
void PrintMatrix(int rows, int cols, int rowstride, int colstride,
                 const DataType* data) {
  for (int r = 0; r < rows; r++) {
    for (int c = 0; c < cols; c++) {
      std::cerr << double(data[r * rowstride + c * colstride]) << " ";
    }
    std::cerr << std::endl;
  }
  std::cerr << std::endl;
}

template <typename DataType>
bool approx_equals(DataType a, DataType b) {
  return a == b;
}

template <>
bool approx_equals(float a, float b) {
  if (!a && !b) {
    return true;
  }
  // 1e-1 is very coarse accuracy, we should switch to an overall L2 metric
  // and tighten the tolerance on that metric.
  return std::abs(a - b) < 1e-1f * std::min(std::abs(a), std::abs(b));
}
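
// Numeric example of the tolerance above (illustrative): 100.0f and 100.5f
// compare approx-equal since |100.0 - 100.5| = 0.5 < 1e-1 * 100.0, while
// 1.0f and 1.2f do not, since |1.0 - 1.2| = 0.2 >= 1e-1 * 1.0.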

template <typename Kernel>
void test_kernel(int depth, const char* kernel_name) {
  typedef typename Kernel::OperandType OperandType;
  typedef typename Kernel::AccumulatorType AccumulatorType;
  typedef typename Kernel::Format Format;
  static const int kLhsWidth = Format::Lhs::kWidth;
  static const int kRhsWidth = Format::Rhs::kWidth;

  typedef ReferenceKernel<OperandType, AccumulatorType, Format> ReferenceKernel;

  CacheLineAlignedBuffer<OperandType> lhs(kLhsWidth * depth);
  CacheLineAlignedBuffer<OperandType> rhs(kRhsWidth * depth);
  CacheLineAlignedBuffer<AccumulatorType> accum_initial(kLhsWidth * kRhsWidth);
  CacheLineAlignedBuffer<AccumulatorType> accum(kLhsWidth * kRhsWidth);
  CacheLineAlignedBuffer<AccumulatorType> accum_reference(kLhsWidth *
                                                          kRhsWidth);

  FillRandom(&lhs);
  FillRandom(&rhs);
  FillRandom(&accum_initial);
  Copy(&accum, accum_initial);
  Copy(&accum_reference, accum_initial);

  ReferenceKernel::Run(lhs.data(), rhs.data(), accum_reference.data(), depth);
  Kernel::Run(lhs.data(), rhs.data(), accum.data(), depth);

  for (int l = 0; l < kLhsWidth; l++) {
    for (int r = 0; r < kRhsWidth; r++) {
      const int index = l + kLhsWidth * r;
      if (!approx_equals(accum.data()[index], accum_reference.data()[index])) {
        std::cerr << "Arithmetic error in kernel:" << std::endl
                  << " " << kernel_name << std::endl
                  << "Wrong accumulator for depth=" << depth << ", "
                  << "at l = " << l << ", r = " << r << std::endl;
        std::cerr << "reference value: " << accum_reference.data()[index]
                  << std::endl;
        std::cerr << "actual value: " << accum.data()[index] << std::endl;
        if (depth <= 16) {
          std::cerr << "LHS matrix:" << std::endl;
          PrintMatrix(kLhsWidth, depth, 1, kLhsWidth, lhs.data());
          std::cerr << "RHS matrix:" << std::endl;
          PrintMatrix(depth, kRhsWidth, kRhsWidth, 1, rhs.data());
          std::cerr << "Initial Accumulator matrix:" << std::endl;
          PrintMatrix(kLhsWidth, kRhsWidth, 1, kLhsWidth, accum_initial.data());
          std::cerr << "Reference Accumulator matrix:" << std::endl;
          PrintMatrix(kLhsWidth, kRhsWidth, 1, kLhsWidth,
                      accum_reference.data());
          std::cerr << "Actual Accumulator matrix:" << std::endl;
          PrintMatrix(kLhsWidth, kRhsWidth, 1, kLhsWidth, accum.data());
        }
        abort();
      }
    }
  }
}

template <typename Kernel>
int ops(int depth) {
  // 2x the number of multiply-accumulate scalar ops: each multiply-accumulate
  // counts as one multiply plus one add.
  return 2 * Kernel::Format::Lhs::kWidth * Kernel::Format::Rhs::kWidth * depth;
}
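
// Example (illustrative): a kernel with a 12-wide Lhs and 8-wide Rhs at
// depth 192 counts 2 * 12 * 8 * 192 = 36864 ops per invocation.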

template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  return i - (i % Modulus);
}
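
// Example (illustrative): RoundDown<64>(198) == 192, RoundDown<64>(64) == 64.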

int CacheSizeInKB() {
  static const char* cache_size_k_env = getenv("CACHE_SIZE_KB");
  static const int cache_size_k =
      cache_size_k_env ? atoi(cache_size_k_env) : kDefaultCacheSizeK;
  return cache_size_k;
}

template <typename Kernel>
int BenchmarkDepthToFitInCache() {
  const int cache_size_bytes = 1024 * CacheSizeInKB();

  // Subtract the typical size of a few cache lines, so
  // we don't need to worry too hard about e.g. some stack data.
  const int conservative_cache_size_bytes =
      cache_size_bytes - 2 * kCacheLineSize;

  // We will subtract the memory occupied by accumulators.
  typedef typename Kernel::AccumulatorType AccumulatorType;
  const int kAccumulatorBytes = sizeof(AccumulatorType) *
                                Kernel::Format::Lhs::kWidth *
                                Kernel::Format::Rhs::kWidth;

  // Compute the depth.
  typedef typename Kernel::OperandType OperandType;
  const int kBytesPerUnitOfDepth =
      sizeof(OperandType) *
      (Kernel::Format::Lhs::kWidth + Kernel::Format::Rhs::kWidth);
  const int unrounded_depth =
      (conservative_cache_size_bytes - kAccumulatorBytes) /
      kBytesPerUnitOfDepth;

  // Cap depth, to avoid unfairly favoring narrower kernels.
  const int kMaxDepth = 1024;
  const int clamped_unrounded_depth = std::min(kMaxDepth, unrounded_depth);

  // Round depth down to a multiple of the cache line size. This helps because
  // our kernels may crash if depth is not a multiple of the number of depth
  // levels that they handle at each loop iteration, and we don't want to
  // require kernels to be more complex. Currently all kernels process 1, 2 or
  // 8 levels of depth at a time. The main reason that might increase in the
  // future is if registers get wider, but registers are unlikely to ever get
  // wider than cache lines.
  return RoundDown<kCacheLineSize>(clamped_unrounded_depth);
}
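
// Worked example (illustrative): with the 16 KiB default cache and a float
// kernel with a 12-wide Lhs and 8-wide Rhs, accumulators take
// 4 * 12 * 8 = 384 bytes and each unit of depth takes 4 * (12 + 8) = 80
// bytes, so unrounded_depth = (16384 - 128 - 384) / 80 = 198, which is
// below the 1024 cap and rounds down to RoundDown<64>(198) = 192.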

double current_time_in_seconds() {
  timespec t;
  clock_gettime(CLOCK_REALTIME, &t);
  return t.tv_sec + 1e-9 * t.tv_nsec;
}

template <typename Kernel>
double benchmark(int depth) {
  // Minimum duration for this benchmark to run. If the workload finishes
  // sooner, we retry with double the number of iterations.
  static const double min_benchmark_time_in_seconds = 1.0;

  typedef typename Kernel::OperandType OperandType;
  typedef typename Kernel::AccumulatorType AccumulatorType;

  CacheLineAlignedBuffer<OperandType> lhs(Kernel::Format::Lhs::kWidth * depth);
  CacheLineAlignedBuffer<OperandType> rhs(Kernel::Format::Rhs::kWidth * depth);
  CacheLineAlignedBuffer<AccumulatorType> accum(Kernel::Format::Lhs::kWidth *
                                                Kernel::Format::Rhs::kWidth);

  for (std::uint64_t iters_at_a_time = 1;; iters_at_a_time *= 2) {
    const double t_start = current_time_in_seconds();
    for (std::uint64_t i = 0; i < iters_at_a_time; i++) {
      Kernel::Run(lhs.data(), rhs.data(), accum.data(), depth);
    }
    const double t_end = current_time_in_seconds();
    const double elapsed = t_end - t_start;
    if (elapsed > min_benchmark_time_in_seconds) {
      // Returns ops per second; callers scale by 1e-9 to print Gop/s.
      return iters_at_a_time * ops<Kernel>(depth) / elapsed;
    }
  }
}

template <typename Kernel>
void benchmark_and_print_results(const char* kernel_name) {
  if (getenv("BENCHMARK_KERNEL")) {
    if (strcmp(getenv("BENCHMARK_KERNEL"), kernel_name)) {
      return;
    }
  }
  const int kKernelDepth = Kernel::Format::kDepth;
  for (int depth = kKernelDepth; depth <= 1024; depth += kKernelDepth) {
    test_kernel<Kernel>(depth, kernel_name);
  }

  if (getenv("BENCHMARK_ALL_DEPTHS")) {
    for (int depth = kKernelDepth;
         depth <= BenchmarkDepthToFitInCache<Kernel>(); depth *= 2) {
      std::cout << kernel_name << "," << depth << ","
                << benchmark<Kernel>(depth) * 1e-9f << std::endl;
    }
  } else {
    const int depth = BenchmarkDepthToFitInCache<Kernel>();
    std::cout << kernel_name << "," << benchmark<Kernel>(depth) * 1e-9f
              << std::endl;
  }
}

#define BENCHMARK(Kernel)                         \
  do {                                            \
    benchmark_and_print_results<Kernel>(#Kernel); \
  } while (false)

int main() {
  if (getenv("BENCHMARK_ALL_DEPTHS")) {
    std::cout << "kernel,depth,Gop/s" << std::endl;
  } else {
    std::cout << "kernel,Gop/s" << std::endl;
  }

#ifdef __arm__
  BENCHMARK(NEON_32bit_GEMM_Int8Operands_AccumTwoWithin16Bits);
  BENCHMARK(NEON_32bit_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics);
  BENCHMARK(NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators);
  BENCHMARK(NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_intrinsics);
  BENCHMARK(NEON_32bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand);
  BENCHMARK(NEON_32bit_GEMM_Int32_WithScalar);
  BENCHMARK(NEON_32bit_GEMM_Float32_MLA_WithVectorDuplicatingScalar);
#ifdef __ARM_FEATURE_FMA
  BENCHMARK(NEON_32bit_GEMM_Float32_FMA_WithVectorDuplicatingScalar);
#endif
  BENCHMARK(NEON_32bit_GEMM_Float32_MLA_WithScalar);
  BENCHMARK(NEON_32bit_GEMM_Float32_WithScalar_intrinsics);
  BENCHMARK(NEON_32bit_GEMM_Float32_WithScalar_A53);
  BENCHMARK(NEON_32bit_GEMM_Float32_WithScalar_A53_depth2);
  BENCHMARK(NEON_32bit_GEMM_Float32_MLA_Rotating);
#ifdef __ARM_FEATURE_FMA
  BENCHMARK(NEON_32bit_GEMM_Float32_FMA_Rotating);
#endif
#endif

#ifdef __aarch64__
  BENCHMARK(NEON_64bit_GEMM_Int8Operands_AccumTwoWithin16Bits);
  BENCHMARK(NEON_64bit_GEMM_Int8Operands_AccumTwoWithin16Bits_intrinsics);
  BENCHMARK(NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators);
  BENCHMARK(NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_intrinsics);
  BENCHMARK(NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_noexpand_A57);
  BENCHMARK(NEON_64bit_GEMM_Int32_WithScalar);
  BENCHMARK(NEON_64bit_GEMM_Float32_WithVectorDuplicatingScalar);
  BENCHMARK(NEON_64bit_GEMM_Float32_WithScalar);
  BENCHMARK(NEON_64bit_GEMM_Float32_WithScalar_intrinsics);
  BENCHMARK(NEON_64bit_GEMM_Float32_WithScalar_A57);
#ifndef __APPLE__
  BENCHMARK(NEON_64bit_GEMM_Float32_WithScalar_A53);
#endif
#endif

  return 0;
}