/*---------------------------------------------------------------*/
/*--- begin host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2015 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR.  There are also helpers for 32-bit arithmetic in here. */

#include "libvex_basictypes.h"
#include "main_util.h"              // LIKELY, UNLIKELY
#include "host_generic_simd64.h"


/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}
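
/* Illustrative example (added): lane 1 is the high half and lane 0
   the low half, so mk32x2(0xDEADBEEF, 0xCAFEBABE) yields
   0xDEADBEEFCAFEBABEULL, and sel32x2_1/sel32x2_0 recover
   0xDEADBEEF and 0xCAFEBABE from it.  The 16x4 and 8x8 helpers
   below follow the same lane-numbering convention. */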


/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}


/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8) | (((UInt)w4) << 0);
   UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}


/* Scalar helpers. */

static inline Int qadd32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) + ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim = 0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767) t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127) t = 127;
   return (Char)t;
}
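
/* Illustrative note (added): the q* helpers saturate rather than
   wrap -- e.g. qadd8S(100, 100) yields 127 and qadd8S(-100, -100)
   yields -128 -- by widening to a type that cannot overflow,
   clamping, and narrowing back. */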

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Int qsub32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) - ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim = 0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767) t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127) t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0) t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0) t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}
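
/* Illustrative note (added): qnarrow16Sto8U reinterprets its
   argument as signed and clamps into the unsigned byte range, e.g.
   0x0123 (291) narrows to 0xFF (255) and 0xFF80 (-128) narrows
   to 0x00. */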

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}
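
/* Illustrative note (added): the sar* variants reinterpret the value
   as signed so the sign bit is replicated -- e.g. sar8(0x80, 1)
   gives 0xC0 -- whereas shr16/shr32 are logical shifts and always
   pull in zeroes. */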

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}
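
/* Illustrative note (added): the "+ 1" makes avg8U/avg16U round
   halves upward, e.g. avg8U(1, 2) == 2, matching the rounding rule
   of the x86 PAVGB/PAVGW instructions. */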

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu ? xxu - yyu : yyu - xxu;
}
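
/* Illustrative note (added): the h* helpers are "halving" ops: they
   form the exact sum or difference and then shift it right by one,
   so no lane can overflow; e.g. hadd8U(1, 2) == 1 (truncating) and
   hadd16S(-1, -2) == -2 (the signed shift rounds toward minus
   infinity). */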

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}
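
/* Illustrative example (added): the pattern throughout is
   select-per-lane, operate, repack, and lanes wrap independently:
      h_generic_calc_Add16x4(0x0000FFFF00000001ULL,
                             0x0000000100000001ULL)
   == 0x0000000000000002ULL, since lane 2 wraps 0xFFFF+1 to 0
   without carrying into lane 3. */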

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}
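
/* Illustrative example (added): Perm8x8 uses each byte of bb (low 3
   bits only, via index8x8) to select a byte of aa; e.g. with
   bb == 0x0001020304050607ULL the result is aa with its bytes
   reversed. */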

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  Given the semantics
   of these primops (ShlN16x4, etc) it is an error if we are ever
   handed an out-of-range shift amount; the masking merely keeps the
   behaviour defined at the C level.
*/
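/* Illustrative note (added): e.g. h_generic_calc_ShlN16x4(x, 3)
   shifts each 16-bit lane of x left by 3; bits shifted out of a
   lane are discarded, never carried into the neighbouring lane. */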
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
{
   UInt r = 0;
   if (xx & (1ULL << (64-1))) r |= (1<<7);
   if (xx & (1ULL << (56-1))) r |= (1<<6);
   if (xx & (1ULL << (48-1))) r |= (1<<5);
   if (xx & (1ULL << (40-1))) r |= (1<<4);
   if (xx & (1ULL << (32-1))) r |= (1<<3);
   if (xx & (1ULL << (24-1))) r |= (1<<2);
   if (xx & (1ULL << (16-1))) r |= (1<<1);
   if (xx & (1ULL << ( 8-1))) r |= (1<<0);
   return r;
}
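
/* Illustrative example (added): GetMSBs8x8 gathers the top bit of
   each byte into an 8-bit mask, in the style of x86 PMOVMSKB on an
   MMX register; e.g.
   h_generic_calc_GetMSBs8x8(0x8000000000000080ULL) == 0x81. */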

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   return (((UInt)w1) << 16) | ((UInt)w2);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}


/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}
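
/* Illustrative example (added): Sad8Ux4 is a 4-byte sum of absolute
   differences, e.g.
   h_generic_calc_Sad8Ux4(0x01020304, 0x04030201) == 3+1+1+3 == 8. */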

UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   return qadd32S( xx, yy );
}

UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   return qsub32S( xx, yy );
}


/*------------------------------------------------------------------*/
/* Decimal Floating Point (DFP) externally visible helper functions */
/* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                     */
/*------------------------------------------------------------------*/

#define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x ) << ( y ) )

static ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* convert a 10-bit densely packed decimal (DPD) chunk to BCD */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = r;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = u;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
       | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
       | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
           | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
           | PUT(k, 1) | PUT(m, 0);
   return value;
}

static ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;
   /* Convert a 3 digit BCD value to a 10 bit Densely Packed Decimal
      (DPD) value.  The boolean equations to calculate the value of
      each of the DPD bits are given in Appendix B of Book 1: Power
      ISA User Instruction Set.  The bits for the DPD number are
      [abcdefghijkm].  The bits for the BCD value are [pqrstuvwxy].
      The boolean logic equations in pseudo C code are:
   */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
       | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
       | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
           | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}
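
/* Illustrative example (added, worked by hand from the equations
   above): the BCD value 0x999 (digits 9,9,9) encodes to the 10-bit
   DPD value 0x0FF, and dpb_to_bcd(0x0FF) recovers 0x999.  This is
   how three decimal digits fit in 10 bits instead of 12. */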

ULong h_calc_DPBtoBCD( ULong dpb )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk  = dpb >> ( 4 - i ) * 10;
      result = result << 12;
      result |= dpb_to_bcd( chunk & 0x3FF );
   }
   return result;
}

ULong h_calc_BCDtoDPB( ULong bcd )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk  = bcd >> ( 4 - i ) * 12;
      result = result << 10;
      result |= bcd_to_dpb( chunk & 0xFFF );
   }
   return result;
}
#undef NOT
#undef GET
#undef PUT


/* ----------------------------------------------------- */
/* Signed and unsigned integer division that behave like
   the ARMv7 UDIV and SDIV instructions.

   sdiv32 also behaves like 64-bit v8 SDIV on w-regs.
   udiv32 also behaves like 64-bit v8 UDIV on w-regs.
*/
/* ----------------------------------------------------- */

UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // C requires rounding towards zero, which is also what we need.
   return x / y;
}

Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((UInt)x) == ((UInt)0x80000000)
                 && ((UInt)y) == ((UInt)0xFFFFFFFF) ))
      return (Int)(UInt)0x80000000;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}
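
// Illustrative note (added): the special case is INT_MIN / -1, whose
// true quotient 2^31 is unrepresentable in 32 bits; ARM defines the
// result as INT_MIN (0x80000000), whereas evaluating x / y directly
// for that input would be undefined behaviour in C.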

Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y )
{
   // Division by zero --> zero
   if (UNLIKELY(y == 0)) return 0;
   // The single case that produces an unrepresentable result
   if (UNLIKELY( ((ULong)x) == ((ULong)0x8000000000000000ULL )
                 && ((ULong)y) == ((ULong)0xFFFFFFFFFFFFFFFFULL ) ))
      return (Long)(ULong)0x8000000000000000ULL;
   // Else return the result rounded towards zero.  C89 says
   // this is implementation defined (in the signed case), but gcc
   // promises to round towards zero.  Nevertheless, at startup,
   // in main_main.c, do a check for that.
   return x / y;
}


/*---------------------------------------------------------------*/
/*--- end host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/