As I’m now nearing the end of my Honours Project, I’m focusing on optimising it. One of the main ways I intend to optimise the project is to use the iPhone’s Vector Floating Point (VFP) unit and ARM assembly for some of the floating-point math.
I’ve just completed my first test, and it’s been moderately successful. I’ve not properly timed it yet; however, it appears to have yielded a 5–10% increase in speed, simply by changing my matrix3x3 multiply from C++ to using the VFP unit.
// Clobber-list fragment naming VFP registers s8-s19: the inline assembly in
// operator* loads matrix B into s8-s16 and accumulates each output row in
// s17-s19, so the compiler must be told these registers are overwritten.
#define MATRIX3x1ANDVECTOR3x1 "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19"
// Clobber-list fragment naming s0-s2, which the inline assembly reloads with
// one row of matrix A at a time.
#define VECTOR3SCALARx1 "s0", "s1", "s2"
// Inline-asm fragment that switches the VFP unit to a vector length of 3.
// It reads FPSCR into r0, clears the bits selected by the mask 0x00370000
// (bits 16-18 and 20-21; the LEN and STRIDE fields per the ARM VFP
// documentation), sets bit 17 (LEN = 0b010, i.e. length 3) and writes FPSCR
// back. r0 is used as scratch, so any asm statement that expands this macro
// has to account for r0 being overwritten.
#define SETVECTORWIDTH3 "fmrx r0, fpscr \n\t" \
"bic r0, r0, #0x00370000 \n\t" \
"orr r0, r0, #0x00020000 \n\t" \
"fmxr fpscr, r0 \n\t"
// Inline-asm fragment that returns the VFP unit to scalar mode (vector
// length 1). It reads FPSCR into r0, clears the bits selected by the mask
// 0x00370000 (bits 16-18 and 20-21; the LEN and STRIDE fields per the ARM
// VFP documentation) and writes FPSCR back.
// No bits are OR-ed back in: setting 0x00020000 here would select a vector
// length of 3 again and leave the unit in vector mode for every
// floating-point instruction executed after the asm block.
// r0 is used as scratch, so any asm statement that expands this macro has to
// account for r0 being overwritten.
#define SETVECTORWIDTH1 "fmrx r0, fpscr \n\t" \
"bic r0, r0, #0x00370000 \n\t" \
"fmxr fpscr, r0 \n\t"
// Multiplies two 3x3 matrices and returns the product _A * _B by value.
// Simulator builds use the plain C++ path; every other build uses the ARM
// VFP inline-assembly path.
//
// WARNING(review): the assembly path of this first listing is known to be
// defective and is kept only as the "before" version:
//  * SETVECTORWIDTH3 / SETVECTORWIDTH1 use r0 as scratch, but "r0" is missing
//    from the clobber list, so the compiler may keep a live value in r0
//    across the asm statement.
//  * The third row of B is loaded into s14-s16 and then used as a length-3
//    vector starting at s14. VFP short vectors wrap inside their 8-register
//    bank (s8-s15 here), so the operand actually read is (s14, s15, s8),
//    not (s14, s15, s16), and the result is wrong.
//  * NOTE(review): %0 and %2 are advanced with '!' write-back although all
//    three operands are declared as inputs only; GCC requires that input
//    operands are not modified by the asm.
IMatrix3x3 operator*(IMatrix3x3 & _A, IMatrix3x3 & _B)
{
IMatrix3x3 C;
//C++ code for the simulator: each element of C is the dot product of a row of _A with a column of _B
#if TARGET_IPHONE_SIMULATOR == true
C.A0 = _A.A0 * _B.A0 + _A.A1 * _B.B0 + _A.A2 * _B.C0;
C.A1 = _A.A0 * _B.A1 + _A.A1 * _B.B1 + _A.A2 * _B.C1;
C.A2 = _A.A0 * _B.A2 + _A.A1 * _B.B2 + _A.A2 * _B.C2;
C.B0 = _A.B0 * _B.A0 + _A.B1 * _B.B0 + _A.B2 * _B.C0;
C.B1 = _A.B0 * _B.A1 + _A.B1 * _B.B1 + _A.B2 * _B.C1;
C.B2 = _A.B0 * _B.A2 + _A.B1 * _B.B2 + _A.B2 * _B.C2;
C.C0 = _A.C0 * _B.A0 + _A.C1 * _B.B0 + _A.C2 * _B.C0;
C.C1 = _A.C0 * _B.A1 + _A.C1 * _B.B1 + _A.C2 * _B.C1;
C.C2 = _A.C0 * _B.A2 + _A.C1 * _B.B2 + _A.C2 * _B.C2;
//VFP ARM asm for the device
#else
//create pointers to the matrices; the asm reads and writes them as consecutive floats
//(presumably nine floats in the order A0..C2 - confirm against the IMatrix3x3 definition)
IMatrix3x3 * pA = &_A;
IMatrix3x3 * pB = &_B;
IMatrix3x3 * pC = &C;
//asm code: %0 = pA, %1 = pB, %2 = pC
asm volatile(
//turn on a vector depth of 3
SETVECTORWIDTH3
//load all nine elements of matrix B into s8-s16 (s16 lies outside the s8-s15 bank)
"fldmias %1, {s8-s16} \n\t"
//load the first row of A into the scalar bank and advance %0 to the next row
"fldmias %0!, {s0-s2} \n\t"
//calculate C.A0, C.A1 and C.A2: intended as (s17,s18,s19) = row0(B)*s0 + row1(B)*s1 + row2(B)*s2
"fmuls s17, s8, s0 \n\t"
"fmacs s17, s11, s1 \n\t"
//the vector starting at s14 wraps to (s14, s15, s8), so this term is wrong
"fmacs s17, s14, s2 \n\t"
//save this into the output and advance %2 to the next output row
"fstmias %2!, {s17-s19} \n\t"
//load the second row of A into the scalar bank
"fldmias %0!, {s0-s2} \n\t"
//calculate C.B0, C.B1 and C.B2 (same sequence, same wrap-around problem)
"fmuls s17, s8, s0 \n\t"
"fmacs s17, s11, s1 \n\t"
"fmacs s17, s14, s2 \n\t"
//save this into the output
"fstmias %2!, {s17-s19} \n\t"
//load the third row of A into the scalar bank
"fldmias %0!, {s0-s2} \n\t"
//calculate C.C0, C.C1 and C.C2 (same sequence, same wrap-around problem)
"fmuls s17, s8, s0 \n\t"
"fmacs s17, s11, s1 \n\t"
"fmacs s17, s14, s2 \n\t"
//save this into the output
"fstmias %2!, {s17-s19} \n\t"
//set the vector depth back to 1
SETVECTORWIDTH1
//no outputs; the three pointers are passed as inputs, followed by the clobber list
//("memory" plus s0-s2 and s8-s19; r0 is not listed)
: : "r"(pA), "r"(pB), "r" (pC)
: "memory",VECTOR3SCALARx1, MATRIX3x1ANDVECTOR3x1
);
#endif
return C;
}
The main problem here appears to be the fact that each time the vector width is changed, it stalls. My next step is to attempt to batch together multiple matrix multiplies to reduce this stall and see if I can come up with a more significant speed increase.
Edit: The above code proved to be wrong for two reasons. First of all, I wasn’t including the "r0" register in the clobber list, even though it is used to change the vector width. This was causing some memory problems; however, there was a bigger issue with how I was calculating the matrix multiply. When writing this code, I was not aware that the registers are broken into 4 circular banks: s0-s7, s8-s15, s16-s23 and s24-s31. This means that if you use a vector width of 3 and use, for example, s14 as the first register ("fmacs s17, s14, s2"), it will wrap round such that the vector would be (s14, s15, s8), NOT (s14, s15, s16) as I had thought. In addition to fixing this, I have optimised the code by calculating the multiplies using non-dependent registers so that they can be performed at the same time.
Here’s the updated, working code:
// Clobber-list fragment for the matrix multiply: s0-s2 (one row of A),
// s8-s13 and s16-s18 (matrix B), s19-s21 (the output row) and s24-s29
// (the two temporary product vectors).
#define MATRIXMULTIPLYREGISTERS "s0", "s1", "s2", "s8", "s9", "s10","s11", "s12", "s13", "s16", "s17", "s18", "s19", "s20", "s21", "s24", "s25", "s26", "s27", "s28", "s29"
// Clobber-list fragment naming every single-precision register s0-s31.
// It is not referenced by the code shown in this listing.
#define ALL "s0", "s1", "s2", "s3","s4", "s5", "s6", "s7", "s8", "s9", "s10","s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31"
// Inline-asm fragment that switches the VFP unit to a vector length of 3.
// It reads FPSCR into r0, clears the bits selected by the mask 0x00370000
// (bits 16-18 and 20-21; the LEN and STRIDE fields per the ARM VFP
// documentation), sets bit 17 (LEN = 0b010, i.e. length 3) and writes FPSCR
// back. r0 is used as scratch, so "r0" must appear in the clobber list of
// any asm statement that expands this macro.
#define SETVECTORWIDTH3 "fmrx r0, fpscr \n\t" \
"bic r0, r0, #0x00370000 \n\t" \
"orr r0, r0, #0x00020000 \n\t" \
"fmxr fpscr, r0 \n\t"
// Inline-asm fragment that returns the VFP unit to scalar mode (vector
// length 1). It reads FPSCR into r0, clears the bits selected by the mask
// 0x00370000 (bits 16-18 and 20-21; the LEN and STRIDE fields per the ARM
// VFP documentation) and writes FPSCR back without setting any of them
// again. r0 is used as scratch, so "r0" must appear in the clobber list of
// any asm statement that expands this macro.
#define SETVECTORWIDTH1 "fmrx r0, fpscr \n\t" \
"bic r0, r0, #0x00370000 \n\t" \
"fmxr fpscr, r0 \n\t"
// 3x3 matrix product: returns _A * _B by value.
//
// Simulator builds evaluate the nine dot products in plain C++. All other
// builds run an ARM VFP short-vector routine that produces one row of the
// result per pass. The assembly treats each matrix as consecutive
// single-precision floats (presumably A0, A1, A2, B0, ... C2 - confirm
// against the IMatrix3x3 definition).
IMatrix3x3 operator*(IMatrix3x3 & _A, IMatrix3x3 & _B)
{
    IMatrix3x3 product;

#if TARGET_IPHONE_SIMULATOR == true
    // Portable path: element (r, c) = row r of _A dotted with column c of _B.

    // first row
    product.A0 = _A.A0 * _B.A0 + _A.A1 * _B.B0 + _A.A2 * _B.C0;
    product.A1 = _A.A0 * _B.A1 + _A.A1 * _B.B1 + _A.A2 * _B.C1;
    product.A2 = _A.A0 * _B.A2 + _A.A1 * _B.B2 + _A.A2 * _B.C2;

    // second row
    product.B0 = _A.B0 * _B.A0 + _A.B1 * _B.B0 + _A.B2 * _B.C0;
    product.B1 = _A.B0 * _B.A1 + _A.B1 * _B.B1 + _A.B2 * _B.C1;
    product.B2 = _A.B0 * _B.A2 + _A.B1 * _B.B2 + _A.B2 * _B.C2;

    // third row
    product.C0 = _A.C0 * _B.A0 + _A.C1 * _B.B0 + _A.C2 * _B.C0;
    product.C1 = _A.C0 * _B.A1 + _A.C1 * _B.B1 + _A.C2 * _B.C1;
    product.C2 = _A.C0 * _B.A2 + _A.C1 * _B.B2 + _A.C2 * _B.C2;
#else
    // Device path. The three cursors are read-write asm operands because the
    // load/store-multiple instructions advance them with '!' write-back.
    IMatrix3x3 * lhsCursor = &_A;      // %0: walks the rows of A
    IMatrix3x3 * rhsCursor = &_B;      // %1: walks B while it is loaded
    IMatrix3x3 * outCursor = &product; // %2: walks the rows of the result

    // One output row. s0-s2 receive the next row of A and act as scalars;
    // the three products land in separate registers (s19-s21, s24-s26,
    // s27-s29) so the multiplies do not depend on one another, then they are
    // summed into s19-s21 and stored to the next output row.
#define IMATRIX3X3_MULTIPLY_ROW \
    "fldmias %0!, {s0-s2} \n\t" \
    "fmuls s19, s8, s0 \n\t" \
    "fmuls s24, s11, s1 \n\t" \
    "fmuls s27, s16, s2 \n\t" \
    "fadds s19, s19, s24 \n\t" \
    "fadds s19, s19, s27 \n\t" \
    "fstmias %2!, {s19-s21} \n\t"

    asm volatile(
        // vector length 3 for the arithmetic below
        SETVECTORWIDTH3
        // B rows 0 and 1 go to s8-s13, row 2 to s16-s18, so that no length-3
        // vector straddles the end of an 8-register bank
        "fldmias %1!, {s8-s13} \n\t"
        "fldmias %1!, {s16-s18} \n\t"
        // rows A, B and C of the result
        IMATRIX3X3_MULTIPLY_ROW
        IMATRIX3X3_MULTIPLY_ROW
        IMATRIX3X3_MULTIPLY_ROW
        // back to scalar mode before returning to compiled code
        SETVECTORWIDTH1
        : "+r"(lhsCursor), "+r"(rhsCursor), "+r"(outCursor)
        :
        : "r0", "memory", MATRIXMULTIPLYREGISTERS
    );

#undef IMATRIX3X3_MULTIPLY_ROW
#endif

    return product;
}