Lect.10.arm soc.4 neon
底下就用 NEON 的概念來寫 Parallel support,不過 compiler/hardware 並沒有支援,所以還是只有自 high 而已...XD
View more presentations from sean chen.
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#define BUF_SIZE 4
/* One converted sample: three independent 8-bit channels. */
struct ABC {
    uint8_t A;
    uint8_t B;
    uint8_t C;
};

/* Destination buffer written by the XYZ -> ABC conversion routines. */
struct ABC ABC_BUF[BUF_SIZE];
/* One input sample: three independent 8-bit channels. */
struct XYZ {
    uint8_t X;
    uint8_t Y;
    uint8_t Z;
};

/* Source buffer read by the XYZ -> ABC conversion routines (fixed demo data). */
struct XYZ XYZ_BUF[BUF_SIZE] = {
    { .X =  0, .Y =  0, .Z =  0 },
    { .X = 25, .Y = 25, .Z = 25 },
    { .X =  3, .Y =  5, .Z =  9 },
    { .X =  7, .Y =  5, .Z =  3 },
};
/*
 * Reference (scalar) conversion of every XYZ_BUF entry into ABC_BUF:
 *   A = X + Y + Z
 *   B = X + Y - Z
 *   C = X - Y - Z
 * The arithmetic is done in int and the result is truncated to 8 bits
 * when stored, so each channel wraps modulo 256.
 */
void C_XYZ2ABC(){
    for (int n = 0; n < BUF_SIZE; ++n) {
        const int x = XYZ_BUF[n].X;
        const int y = XYZ_BUF[n].Y;
        const int z = XYZ_BUF[n].Z;

        ABC_BUF[n].A = (uint8_t)(x + y + z);
        ABC_BUF[n].B = (uint8_t)(x + y - z);
        ABC_BUF[n].C = (uint8_t)(x - y - z);
    }
}
/*
 * Same conversion as the scalar reference, but broken into two-operand
 * steps with 8-bit temporaries: first combine X and Y, then apply Z.
 * Every intermediate is truncated to 8 bits before the next step.
 */
void ASSEMBLY_XYZ2ABC(){
    int idx;

    for (idx = 0; idx < BUF_SIZE; ++idx) {
        /* X + Y feeds both A and B; X - Y feeds C. */
        const uint8_t xy_sum  = (uint8_t)(XYZ_BUF[idx].X + XYZ_BUF[idx].Y);
        const uint8_t xy_diff = (uint8_t)(XYZ_BUF[idx].X - XYZ_BUF[idx].Y);

        ABC_BUF[idx].A = (uint8_t)(xy_sum  + XYZ_BUF[idx].Z);
        ABC_BUF[idx].B = (uint8_t)(xy_sum  - XYZ_BUF[idx].Z);
        ABC_BUF[idx].C = (uint8_t)(xy_diff - XYZ_BUF[idx].Z);
    }
}
/*
 * Pack four 8-bit lanes into one 32-bit word.
 *
 *   i24 -> bits 31..24
 *   i16 -> bits 23..16
 *   i8  -> bits 15..8
 *   i0  -> bits  7..0
 *
 * Each lane is widened to uint32_t before it is shifted.  A uint8_t operand
 * is otherwise promoted to signed int, and shifting a value >= 0x80 left by
 * 24 overflows int, which is undefined behaviour.
 */
uint32_t Exp4x8uint_8(uint8_t i24,uint8_t i16,uint8_t i8,uint8_t i0){
    /* The lanes occupy disjoint bit ranges, so OR is equivalent to adding. */
    return ((uint32_t)i24 << 24)
         | ((uint32_t)i16 << 16)
         | ((uint32_t)i8  <<  8)
         |  (uint32_t)i0;
}
/*
 * Lane-wise 8-bit arithmetic on two packed 32-bit words.
 *
 * a32 and b32 are each treated as four independent 8-bit lanes
 * (bits 31..24, 23..16, 15..8, 7..0).  For every lane:
 *   op == 0 : result = a + b
 *   op == 1 : result = a - b
 *   other   : result = a   (a32 is returned unchanged)
 * Each lane wraps modulo 256; no carry or borrow crosses into a
 * neighbouring lane.
 *
 * Returns the four result lanes repacked in the same positions.
 */
uint32_t Exe4x8uint_32(uint32_t a32,uint32_t b32,uint8_t op){
    uint32_t r = 0;

    for (unsigned shift = 0; shift < 32; shift += 8) {
        uint8_t a = (uint8_t)(a32 >> shift);
        const uint8_t b = (uint8_t)(b32 >> shift);

        switch (op) {
        case 0:
            a = (uint8_t)(a + b);
            break;
        case 1:
            a = (uint8_t)(a - b);
            break;
        default:
            /* Unknown operation: leave the lane of a32 untouched. */
            break;
        }

        /*
         * Widen before shifting: a uint8_t is promoted to signed int, and
         * moving a value >= 0x80 into bits 31..24 would overflow it (UB).
         */
        r |= (uint32_t)a << shift;
    }
    return r;
}
/*
 * Packed version of the XYZ -> ABC conversion: four entries are handled
 * per iteration by packing each channel of four consecutive XYZ_BUF
 * entries into one 32-bit word (entry i in the low byte, entry i+3 in
 * the high byte) and applying lane-wise 8-bit add/sub.
 *
 *   A = (X + Y) + Z
 *   B = (X + Y) - Z
 *   C = (X - Y) - Z
 *
 * The packed loop only runs while a complete group of four entries is
 * available.  Any remaining entries (when BUF_SIZE is not a multiple of
 * four) are converted one at a time, so the buffers are never indexed
 * past BUF_SIZE - 1.
 */
void NEON_XYZ2ABC(){
    int i;
    uint32_t X,Y,Z;
    uint32_t A,B,C;
    uint32_t XY;

    for( i=0; i+4<=BUF_SIZE; i=i+4 ){
        X = Exp4x8uint_8(XYZ_BUF[i+3].X, XYZ_BUF[i+2].X, XYZ_BUF[i+1].X, XYZ_BUF[i  ].X);
        Y = Exp4x8uint_8(XYZ_BUF[i+3].Y, XYZ_BUF[i+2].Y, XYZ_BUF[i+1].Y, XYZ_BUF[i  ].Y);
        Z = Exp4x8uint_8(XYZ_BUF[i+3].Z, XYZ_BUF[i+2].Z, XYZ_BUF[i+1].Z, XYZ_BUF[i  ].Z);

        /* X + Y is shared by A and B, so compute it once. */
        XY = Exe4x8uint_32(X,Y,0);
        A  = Exe4x8uint_32(XY,Z,0);
        B  = Exe4x8uint_32(XY,Z,1);
        C  = Exe4x8uint_32(X,Y,1);
        C  = Exe4x8uint_32(C,Z,1);

        /* Unpack: the store to uint8_t keeps only the low byte of each shift. */
        ABC_BUF[i+3].A = (uint8_t)(A>>24); ABC_BUF[i+2].A = (uint8_t)(A>>16);
        ABC_BUF[i+1].A = (uint8_t)(A>>8);  ABC_BUF[i  ].A = (uint8_t)A;
        ABC_BUF[i+3].B = (uint8_t)(B>>24); ABC_BUF[i+2].B = (uint8_t)(B>>16);
        ABC_BUF[i+1].B = (uint8_t)(B>>8);  ABC_BUF[i  ].B = (uint8_t)B;
        ABC_BUF[i+3].C = (uint8_t)(C>>24); ABC_BUF[i+2].C = (uint8_t)(C>>16);
        ABC_BUF[i+1].C = (uint8_t)(C>>8);  ABC_BUF[i  ].C = (uint8_t)C;
    }

    /* Scalar tail for the last (BUF_SIZE % 4) entries; same modulo-256 results. */
    for( ; i<BUF_SIZE; i++ ){
        ABC_BUF[i].A = (uint8_t)(XYZ_BUF[i].X + XYZ_BUF[i].Y + XYZ_BUF[i].Z);
        ABC_BUF[i].B = (uint8_t)(XYZ_BUF[i].X + XYZ_BUF[i].Y - XYZ_BUF[i].Z);
        ABC_BUF[i].C = (uint8_t)(XYZ_BUF[i].X - XYZ_BUF[i].Y - XYZ_BUF[i].Z);
    }
}
/*
 * Print every ABC_BUF entry on its own line: the index in hex, followed
 * by the A, B and C channels in hex (each labelled with the decimal
 * index).  A blank line is printed after the last entry.
 */
void Display_ABC(){
    int row = 0;

    while (row < BUF_SIZE) {
        printf("I :: %3x,",row);
        printf("A[%2d]:: %3x,",row,ABC_BUF[row].A);
        printf("B[%2d]:: %3x,",row,ABC_BUF[row].B);
        printf("C[%2d]:: %3x,\n",row,ABC_BUF[row].C);
        ++row;
    }
    printf("\n");
}
/*
 * Run the scalar conversion and the packed conversion back to back and
 * print ABC_BUF after each, so the two result tables can be compared.
 * Command-line arguments are not used.
 */
int main(int argc,char *argv[]){
    (void)argc;
    (void)argv;

    fputs("C Code Result...\n", stdout);
    C_XYZ2ABC();
    Display_ABC();

    fputs("NEON Result...\n", stdout);
    NEON_XYZ2ABC();
    Display_ABC();

    return 0;
}
code download here
Refs:
RGB 2 YUV
Optimizing Code for ARM Cortex-A8 with NEON SIMD
沒有留言:
張貼留言