Example 8–12. Vectorized form of the Vector Complex Multiply Kernel
/*
 * vec_cx_mpy -- element-wise multiply of two arrays of 16-bit complex
 * numbers, two complex values (four shorts) per loop iteration.
 *
 *   a, b   input arrays of packed 16-bit values
 *   c      output array of packed 16-bit results
 *   len    number of short elements to process
 *   shift  accepted for interface compatibility; this version always
 *          scales the 32-bit products by a fixed right shift of 16
 *
 * Each iteration reads a[i..i+3] and b[i..i+3] and writes c[i..i+3] as a
 * single 64-bit access through a (double *) cast, so the loop always
 * touches whole groups of four shorts: len is expected to be a multiple
 * of 4.  NOTE(review): the 64-bit accesses presumably also require the
 * arrays to be double-word aligned -- confirm against the target's
 * load/store rules.
 */
void vec_cx_mpy(const short *restrict a, const short *restrict b,
                short *restrict c, int len, int shift)
{
    int i;
    unsigned a3_a2, a1_a0;      /* Packed 16-bit values     */
    unsigned b3_b2, b1_b0;      /* Packed 16-bit values     */
    short    a3, a2, a1, a0;    /* Separate 16-bit elements */
    short    b3, b2, b1, b0;    /* Separate 16-bit elements */
    short    c3, c2, c1, c0;    /* Separate 16-bit results  */
    unsigned c3_c2, c1_c0;      /* Packed 16-bit values     */

    (void) shift;               /* Unused: the scaling below is fixed at 16 */

    for (i = 0; i < len; i += 4)
    {
        /* Load two complex numbers from the a[] array.                */
        /* The complex values loaded are represented as 'a3 + a2 * j'  */
        /* and 'a1 + a0 * j'.  That is, the real components are a3     */
        /* and a1, and the imaginary components are a2 and a0.         */
        a3_a2 = _hi(*(const double *) &a[i]);
        a1_a0 = _lo(*(const double *) &a[i]);

        /* Load two complex numbers from the b[] array.                */
        b3_b2 = _hi(*(const double *) &b[i]);
        b1_b0 = _lo(*(const double *) &b[i]);

        /* Separate the 16-bit coefficients so that the complex        */
        /* multiply may be performed.  This portion needs further      */
        /* optimization.                                               */
        /* The upper half comes from an arithmetic shift of the word   */
        /* viewed as signed; the lower half is extracted with _ext.    */
        a3 = ((signed) a3_a2) >> 16;
        a2 = _ext(a3_a2, 16, 16);
        a1 = ((signed) a1_a0) >> 16;
        a0 = _ext(a1_a0, 16, 16);

        /* The b elements must come from the b words, not the a words. */
        b3 = ((signed) b3_b2) >> 16;
        b2 = _ext(b3_b2, 16, 16);
        b1 = ((signed) b1_b0) >> 16;
        b0 = _ext(b1_b0, 16, 16);

        /* Perform the complex multiplies using 16x16 multiplies.      */
        /* c3 and c1 are the cross terms (imaginary parts); c2 and c0  */
        /* are the real parts.                                         */
        /* NOTE(review): this packs imaginary-then-real, the opposite  */
        /* of the 'real in the upper half' input layout described      */
        /* above -- confirm the intended output ordering.              */
        c3 = (b3 * a2 + b2 * a3) >> 16;
        c2 = (b3 * a3 - b2 * a2) >> 16;
        c1 = (b1 * a0 + b0 * a1) >> 16;
        c0 = (b1 * a1 - b0 * a0) >> 16;

        /* Pack the 16-bit results into 32-bit words.                  */
        c3_c2 = _pack2(c3, c2);
        c1_c0 = _pack2(c1, c0);

        /* Store the results. */
        *(double *) &c[i] = _itod(c3_c2, c1_c0);
    }
}
8-33
Need help?
Do you have a question about the TMS320C6000 and is the answer not in the manual?