Revision 3ca96802 libavcodec/ppc/h264_template_altivec.c
libavcodec/ppc/h264_template_altivec.c  

27  27 
((8 - x) * (y)), 
28  28 
((x) * (y))}; 
29  29 
register int i; 
30 
vector unsigned char fperm;


31 
const vector signed int vABCD = vec_ld(0, ABCD);


32 
const vector signed short vA = vec_splat((vector signed short)vABCD, 1);


33 
const vector signed short vB = vec_splat((vector signed short)vABCD, 3);


34 
const vector signed short vC = vec_splat((vector signed short)vABCD, 5);


35 
const vector signed short vD = vec_splat((vector signed short)vABCD, 7);


36 
const vector signed int vzero = vec_splat_s32(0);


37 
const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));


38 
const vector unsigned short v6us = vec_splat_u16(6);


30 
vec_u8_t fperm;


31 
const vec_s32_t vABCD = vec_ld(0, ABCD);


32 
const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);


33 
const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);


34 
const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);


35 
const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);


36 
LOAD_ZERO;


37 
const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));


38 
const vec_u16_t v6us = vec_splat_u16(6);


39  39 
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; 
40  40 
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; 
41  41  
42 
vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;


43 
vector unsigned char vsrc0uc, vsrc1uc;


44 
vector signed short vsrc0ssH, vsrc1ssH;


45 
vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;


46 
vector signed short vsrc2ssH, vsrc3ssH, psum;


47 
vector unsigned char vdst, ppsum, vfdst, fsum;


42 
vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;


43 
vec_u8_t vsrc0uc, vsrc1uc;


44 
vec_s16_t vsrc0ssH, vsrc1ssH;


45 
vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;


46 
vec_s16_t vsrc2ssH, vsrc3ssH, psum;


47 
vec_u8_t vdst, ppsum, vfdst, fsum;


48  48  
49  49 
POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); 
50  50  
51  51 
if (((unsigned long)dst) % 16 == 0) { 
52 
fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,


53 
0x14, 0x15, 0x16, 0x17,


54 
0x08, 0x09, 0x0A, 0x0B,


55 
0x0C, 0x0D, 0x0E, 0x0F);


52 
fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,


53 
0x14, 0x15, 0x16, 0x17, 

54 
0x08, 0x09, 0x0A, 0x0B, 

55 
0x0C, 0x0D, 0x0E, 0x0F); 

56  56 
} else { 
57 
fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,


58 
0x04, 0x05, 0x06, 0x07,


59 
0x18, 0x19, 0x1A, 0x1B,


60 
0x1C, 0x1D, 0x1E, 0x1F);


57 
fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,


58 
0x04, 0x05, 0x06, 0x07, 

59 
0x18, 0x19, 0x1A, 0x1B, 

60 
0x1C, 0x1D, 0x1E, 0x1F); 

61  61 
} 
62  62  
63  63 
vsrcAuc = vec_ld(0, src); 
...  ...  
73  73 
else 
74  74 
vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); 
75  75  
76 
vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, 

77 
(vector unsigned char)vsrc0uc); 

78 
vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, 

79 
(vector unsigned char)vsrc1uc); 

76 
vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc); 

77 
vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc); 

80  78  
81  79 
if (!loadSecond) {// -> !reallyBadAlign 
82  80 
for (i = 0 ; i < h ; i++) { 
...  ...  
87  85 
vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); 
88  86 
vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); 
89  87  
90 
vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, 

91 
(vector unsigned char)vsrc2uc); 

92 
vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, 

93 
(vector unsigned char)vsrc3uc); 

88 
vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc); 

89 
vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc); 

94  90  
95  91 
psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); 
96  92 
psum = vec_mladd(vB, vsrc1ssH, psum); 
...  ...  
100  96 
psum = vec_sra(psum, v6us); 
101  97  
102  98 
vdst = vec_ld(0, dst); 
103 
ppsum = (vector unsigned char)vec_packsu(psum, psum);


99 
ppsum = (vec_u8_t)vec_packsu(psum, psum);


104  100 
vfdst = vec_perm(vdst, ppsum, fperm); 
105  101  
106  102 
OP_U8_ALTIVEC(fsum, vfdst, vdst); 
...  ...  
114  110 
src += stride; 
115  111 
} 
116  112 
} else { 
117 
vector unsigned char vsrcDuc;


113 
vec_u8_t vsrcDuc;


118  114 
for (i = 0 ; i < h ; i++) { 
119  115 
vsrcCuc = vec_ld(stride + 0, src); 
120  116 
vsrcDuc = vec_ld(stride + 16, src); 
...  ...  
125  121 
else 
126  122 
vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); 
127  123  
128 
vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, 

129 
(vector unsigned char)vsrc2uc); 

130 
vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, 

131 
(vector unsigned char)vsrc3uc); 

124 
vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc); 

125 
vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc); 

132  126  
133  127 
psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); 
134  128 
psum = vec_mladd(vB, vsrc1ssH, psum); 
...  ...  
138  132 
psum = vec_sr(psum, v6us); 
139  133  
140  134 
vdst = vec_ld(0, dst); 
141 
ppsum = (vector unsigned char)vec_pack(psum, psum);


135 
ppsum = (vec_u8_t)vec_pack(psum, psum);


142  136 
vfdst = vec_perm(vdst, ppsum, fperm); 
143  137  
144  138 
OP_U8_ALTIVEC(fsum, vfdst, vdst); 
...  ...  
160  154 
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); 
161  155 
register int i; 
162  156  
163 
const vector signed int vzero = vec_splat_s32(0); 

164 
const vector unsigned char permM2 = vec_lvsl(-2, src); 

165 
const vector unsigned char permM1 = vec_lvsl(-1, src); 

166 
const vector unsigned char permP0 = vec_lvsl(+0, src); 

167 
const vector unsigned char permP1 = vec_lvsl(+1, src); 

168 
const vector unsigned char permP2 = vec_lvsl(+2, src); 

169 
const vector unsigned char permP3 = vec_lvsl(+3, src); 

170 
const vector signed short v5ss = vec_splat_s16(5); 

171 
const vector unsigned short v5us = vec_splat_u16(5); 

172 
const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 

173 
const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); 

174 
const vector unsigned char dstperm = vec_lvsr(0, dst); 

175 
const vector unsigned char neg1 = 

176 
(const vector unsigned char) vec_splat_s8(-1); 

177  
178 
const vector unsigned char dstmask = 

179 
vec_perm((const vector unsigned char)vzero, 

180 
neg1, dstperm); 

181  
182 
vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 

157 
LOAD_ZERO; 

158 
const vec_u8_t permM2 = vec_lvsl(-2, src); 

159 
const vec_u8_t permM1 = vec_lvsl(-1, src); 

160 
const vec_u8_t permP0 = vec_lvsl(+0, src); 

161 
const vec_u8_t permP1 = vec_lvsl(+1, src); 

162 
const vec_u8_t permP2 = vec_lvsl(+2, src); 

163 
const vec_u8_t permP3 = vec_lvsl(+3, src); 

164 
const vec_s16_t v5ss = vec_splat_s16(5); 

165 
const vec_u16_t v5us = vec_splat_u16(5); 

166 
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 

167 
const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); 

168 
const vec_u8_t dstperm = vec_lvsr(0, dst); 

169 
const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1); 

170 
const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm); 

171  
172 
vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 

183  173  
184  174 
register int align = ((((unsigned long)src) - 2) % 16); 
185  175  
186 
vector signed short srcP0A, srcP0B, srcP1A, srcP1B,


176 
vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,


187  177 
srcP2A, srcP2B, srcP3A, srcP3B, 
188  178 
srcM1A, srcM1B, srcM2A, srcM2B, 
189  179 
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, 
190  180 
pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, 
191  181 
psumA, psumB, sumA, sumB; 
192  182  
193 
vector unsigned char sum, dst1, dst2, vdst, fsum, 

194 
rsum, fdst1, fdst2; 

183 
vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2; 

195  184  
196  185 
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); 
197  186  
198  187 
for (i = 0 ; i < 16 ; i ++) { 
199 
vector unsigned char srcR1 = vec_ld(-2, src);


200 
vector unsigned char srcR2 = vec_ld(14, src);


188 
vec_u8_t srcR1 = vec_ld(-2, src);


189 
vec_u8_t srcR2 = vec_ld(14, src);


201  190  
202  191 
switch (align) { 
203  192 
default: { 
...  ...  
217  206 
srcP3 = srcR2; 
218  207 
} break; 
219  208 
case 12: { 
220 
vector unsigned char srcR3 = vec_ld(30, src);


209 
vec_u8_t srcR3 = vec_ld(30, src);


221  210 
srcM2 = vec_perm(srcR1, srcR2, permM2); 
222  211 
srcM1 = vec_perm(srcR1, srcR2, permM1); 
223  212 
srcP0 = vec_perm(srcR1, srcR2, permP0); 
...  ...  
226  215 
srcP3 = vec_perm(srcR2, srcR3, permP3); 
227  216 
} break; 
228  217 
case 13: { 
229 
vector unsigned char srcR3 = vec_ld(30, src);


218 
vec_u8_t srcR3 = vec_ld(30, src);


230  219 
srcM2 = vec_perm(srcR1, srcR2, permM2); 
231  220 
srcM1 = vec_perm(srcR1, srcR2, permM1); 
232  221 
srcP0 = vec_perm(srcR1, srcR2, permP0); 
...  ...  
235  224 
srcP3 = vec_perm(srcR2, srcR3, permP3); 
236  225 
} break; 
237  226 
case 14: { 
238 
vector unsigned char srcR3 = vec_ld(30, src);


227 
vec_u8_t srcR3 = vec_ld(30, src);


239  228 
srcM2 = vec_perm(srcR1, srcR2, permM2); 
240  229 
srcM1 = vec_perm(srcR1, srcR2, permM1); 
241  230 
srcP0 = srcR2; 
...  ...  
244  233 
srcP3 = vec_perm(srcR2, srcR3, permP3); 
245  234 
} break; 
246  235 
case 15: { 
247 
vector unsigned char srcR3 = vec_ld(30, src);


236 
vec_u8_t srcR3 = vec_ld(30, src);


248  237 
srcM2 = vec_perm(srcR1, srcR2, permM2); 
249  238 
srcM1 = srcR2; 
250  239 
srcP0 = vec_perm(srcR2, srcR3, permP0); 
...  ...  
254  243 
} break; 
255  244 
} 
256  245  
257 
srcP0A = (vector signed short) 

258 
vec_mergeh((vector unsigned char)vzero, srcP0); 

259 
srcP0B = (vector signed short) 

260 
vec_mergel((vector unsigned char)vzero, srcP0); 

261 
srcP1A = (vector signed short) 

262 
vec_mergeh((vector unsigned char)vzero, srcP1); 

263 
srcP1B = (vector signed short) 

264 
vec_mergel((vector unsigned char)vzero, srcP1); 

265  
266 
srcP2A = (vector signed short) 

267 
vec_mergeh((vector unsigned char)vzero, srcP2); 

268 
srcP2B = (vector signed short) 

269 
vec_mergel((vector unsigned char)vzero, srcP2); 

270 
srcP3A = (vector signed short) 

271 
vec_mergeh((vector unsigned char)vzero, srcP3); 

272 
srcP3B = (vector signed short) 

273 
vec_mergel((vector unsigned char)vzero, srcP3); 

274  
275 
srcM1A = (vector signed short) 

276 
vec_mergeh((vector unsigned char)vzero, srcM1); 

277 
srcM1B = (vector signed short) 

278 
vec_mergel((vector unsigned char)vzero, srcM1); 

279 
srcM2A = (vector signed short) 

280 
vec_mergeh((vector unsigned char)vzero, srcM2); 

281 
srcM2B = (vector signed short) 

282 
vec_mergel((vector unsigned char)vzero, srcM2); 

246 
srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); 

247 
srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); 

248 
srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); 

249 
srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); 

250  
251 
srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); 

252 
srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); 

253 
srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); 

254 
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); 

255  
256 
srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); 

257 
srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); 

258 
srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); 

259 
srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); 

283  260  
284  261 
sum1A = vec_adds(srcP0A, srcP1A); 
285  262 
sum1B = vec_adds(srcP0B, srcP1B); 
...  ...  
291  268 
pp1A = vec_mladd(sum1A, v20ss, v16ss); 
292  269 
pp1B = vec_mladd(sum1B, v20ss, v16ss); 
293  270  
294 
pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);


295 
pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);


271 
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);


272 
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);


296  273  
297  274 
pp3A = vec_add(sum3A, pp1A); 
298  275 
pp3B = vec_add(sum3B, pp1B); 
...  ...  
330  307  
331  308 
register int i; 
332  309  
333 
const vector signed int vzero = vec_splat_s32(0);


334 
const vector unsigned char perm = vec_lvsl(0, src);


335 
const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));


336 
const vector unsigned short v5us = vec_splat_u16(5);


337 
const vector signed short v5ss = vec_splat_s16(5);


338 
const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));


339 
const vector unsigned char dstperm = vec_lvsr(0, dst);


340 
const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);


341 
const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);


310 
LOAD_ZERO;


311 
const vec_u8_t perm = vec_lvsl(0, src);


312 
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));


313 
const vec_u16_t v5us = vec_splat_u16(5);


314 
const vec_s16_t v5ss = vec_splat_s16(5);


315 
const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));


316 
const vec_u8_t dstperm = vec_lvsr(0, dst);


317 
const vec_u8_t neg1 = (const vec_u8_t)vec_splat_s8(-1);


318 
const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);


342  319  
343  320 
uint8_t *srcbis = src - (srcStride * 2); 
344  321  
345 
const vector unsigned char srcM2a = vec_ld(0, srcbis);


346 
const vector unsigned char srcM2b = vec_ld(16, srcbis);


347 
const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);


322 
const vec_u8_t srcM2a = vec_ld(0, srcbis);


323 
const vec_u8_t srcM2b = vec_ld(16, srcbis);


324 
const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);


348  325 
// srcbis += srcStride; 
349 
const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);


350 
const vector unsigned char srcM1b = vec_ld(16, srcbis);


351 
const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);


326 
const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);


327 
const vec_u8_t srcM1b = vec_ld(16, srcbis);


328 
const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);


352  329 
// srcbis += srcStride; 
353 
const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);


354 
const vector unsigned char srcP0b = vec_ld(16, srcbis);


355 
const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);


330 
const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);


331 
const vec_u8_t srcP0b = vec_ld(16, srcbis);


332 
const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);


356  333 
// srcbis += srcStride; 
357 
const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);


358 
const vector unsigned char srcP1b = vec_ld(16, srcbis);


359 
const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);


334 
const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);


335 
const vec_u8_t srcP1b = vec_ld(16, srcbis);


336 
const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);


360  337 
// srcbis += srcStride; 
361 
const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);


362 
const vector unsigned char srcP2b = vec_ld(16, srcbis);


363 
const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);


338 
const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);


339 
const vec_u8_t srcP2b = vec_ld(16, srcbis);


340 
const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);


364  341 
// srcbis += srcStride; 
365  342  
366 
vector signed short srcM2ssA = (vector signed short) 

367 
vec_mergeh((vector unsigned char)vzero, srcM2); 

368 
vector signed short srcM2ssB = (vector signed short) 

369 
vec_mergel((vector unsigned char)vzero, srcM2); 

370 
vector signed short srcM1ssA = (vector signed short) 

371 
vec_mergeh((vector unsigned char)vzero, srcM1); 

372 
vector signed short srcM1ssB = (vector signed short) 

373 
vec_mergel((vector unsigned char)vzero, srcM1); 

374 
vector signed short srcP0ssA = (vector signed short) 

375 
vec_mergeh((vector unsigned char)vzero, srcP0); 

376 
vector signed short srcP0ssB = (vector signed short) 

377 
vec_mergel((vector unsigned char)vzero, srcP0); 

378 
vector signed short srcP1ssA = (vector signed short) 

379 
vec_mergeh((vector unsigned char)vzero, srcP1); 

380 
vector signed short srcP1ssB = (vector signed short) 

381 
vec_mergel((vector unsigned char)vzero, srcP1); 

382 
vector signed short srcP2ssA = (vector signed short) 

383 
vec_mergeh((vector unsigned char)vzero, srcP2); 

384 
vector signed short srcP2ssB = (vector signed short) 

385 
vec_mergel((vector unsigned char)vzero, srcP2); 

386  
387 
vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, 

343 
vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); 

344 
vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2); 

345 
vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); 

346 
vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1); 

347 
vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); 

348 
vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0); 

349 
vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); 

350 
vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1); 

351 
vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); 

352 
vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2); 

353  
354 
vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, 

388  355 
psumA, psumB, sumA, sumB, 
389  356 
srcP3ssA, srcP3ssB, 
390  357 
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; 
391  358  
392 
vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, 

393 
srcP3a, srcP3b, srcP3; 

359 
vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, srcP3a, srcP3b, srcP3; 

394  360  
395  361 
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); 
396  362  
...  ...  
398  364 
srcP3a = vec_ld(0, srcbis += srcStride); 
399  365 
srcP3b = vec_ld(16, srcbis); 
400  366 
srcP3 = vec_perm(srcP3a, srcP3b, perm); 
401 
srcP3ssA = (vector signed short) 

402 
vec_mergeh((vector unsigned char)vzero, srcP3); 

403 
srcP3ssB = (vector signed short) 

404 
vec_mergel((vector unsigned char)vzero, srcP3); 

367 
srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); 

368 
srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3); 

405  369 
// srcbis += srcStride; 
406  370  
407  371 
sum1A = vec_adds(srcP0ssA, srcP1ssA); 
...  ...  
425  389 
pp1A = vec_mladd(sum1A, v20ss, v16ss); 
426  390 
pp1B = vec_mladd(sum1B, v20ss, v16ss); 
427  391  
428 
pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);


429 
pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);


392 
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);


393 
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);


430  394  
431  395 
pp3A = vec_add(sum3A, pp1A); 
432  396 
pp3B = vec_add(sum3B, pp1B); 
...  ...  
461  425 
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { 
462  426 
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); 
463  427 
register int i; 
464 
const vector signed int vzero = vec_splat_s32(0);


465 
const vector unsigned char permM2 = vec_lvsl(-2, src);


466 
const vector unsigned char permM1 = vec_lvsl(-1, src);


467 
const vector unsigned char permP0 = vec_lvsl(+0, src);


468 
const vector unsigned char permP1 = vec_lvsl(+1, src);


469 
const vector unsigned char permP2 = vec_lvsl(+2, src);


470 
const vector unsigned char permP3 = vec_lvsl(+3, src);


471 
const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));


472 
const vector unsigned int v10ui = vec_splat_u32(10);


473 
const vector signed short v5ss = vec_splat_s16(5);


474 
const vector signed short v1ss = vec_splat_s16(1);


475 
const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));


476 
const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));


428 
LOAD_ZERO;


429 
const vec_u8_t permM2 = vec_lvsl(-2, src);


430 
const vec_u8_t permM1 = vec_lvsl(-1, src);


431 
const vec_u8_t permP0 = vec_lvsl(+0, src);


432 
const vec_u8_t permP1 = vec_lvsl(+1, src);


433 
const vec_u8_t permP2 = vec_lvsl(+2, src);


434 
const vec_u8_t permP3 = vec_lvsl(+3, src);


435 
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));


436 
const vec_u32_t v10ui = vec_splat_u32(10);


437 
const vec_s16_t v5ss = vec_splat_s16(5);


438 
const vec_s16_t v1ss = vec_splat_s16(1);


439 
const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));


440 
const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));


477  441  
478  442 
register int align = ((((unsigned long)src) - 2) % 16); 
479  443  
480 
const vector unsigned char neg1 = (const vector unsigned char) 

481 
vec_splat_s8(-1); 

444 
const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1); 

482  445  
483 
vector signed short srcP0A, srcP0B, srcP1A, srcP1B,


446 
vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,


484  447 
srcP2A, srcP2B, srcP3A, srcP3B, 
485  448 
srcM1A, srcM1B, srcM2A, srcM2B, 
486  449 
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, 
487  450 
pp1A, pp1B, pp2A, pp2B, psumA, psumB; 
488  451  
489 
const vector unsigned char dstperm = vec_lvsr(0, dst);


452 
const vec_u8_t dstperm = vec_lvsr(0, dst);


490  453  
491 
const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);


454 
const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);


492  455  
493 
const vector unsigned char mperm = (const vector unsigned char)


456 
const vec_u8_t mperm = (const vec_u8_t)


494  457 
AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 
495  458 
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); 
496  459 
int16_t *tmpbis = tmp; 
497  460  
498 
vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,


461 
vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,


499  462 
tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, 
500  463 
tmpP2ssA, tmpP2ssB; 
501  464  
502 
vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,


465 
vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,


503  466 
pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, 
504  467 
pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, 
505  468 
ssumAe, ssumAo, ssumBe, ssumBo; 
506 
vector unsigned char fsum, sumv, sum, dst1, dst2, vdst, 

507 
rsum, fdst1, fdst2; 

508 
vector signed short ssume, ssumo; 

469 
vec_u8_t fsum, sumv, sum, dst1, dst2, vdst, rsum, fdst1, fdst2; 

470 
vec_s16_t ssume, ssumo; 

509  471  
510  472 
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); 
511  473 
src -= (2 * srcStride); 
512  474 
for (i = 0 ; i < 21 ; i ++) { 
513 
vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;


514 
vector unsigned char srcR1 = vec_ld(-2, src);


515 
vector unsigned char srcR2 = vec_ld(14, src);


475 
vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;


476 
vec_u8_t srcR1 = vec_ld(-2, src);


477 
vec_u8_t srcR2 = vec_ld(14, src);


516  478  
517  479 
switch (align) { 
518  480 
default: { 
...  ...  
532  494 
srcP3 = srcR2; 
533  495 
} break; 
534  496 
case 12: { 
535 
vector unsigned char srcR3 = vec_ld(30, src);


497 
vec_u8_t srcR3 = vec_ld(30, src);


536  498 
srcM2 = vec_perm(srcR1, srcR2, permM2); 
537  499 
srcM1 = vec_perm(srcR1, srcR2, permM1); 
538  500 
srcP0 = vec_perm(srcR1, srcR2, permP0); 
...  ...  
541  503 
srcP3 = vec_perm(srcR2, srcR3, permP3); 
542  504 
} break; 
543  505 
case 13: { 
544 
vector unsigned char srcR3 = vec_ld(30, src);


506 
vec_u8_t srcR3 = vec_ld(30, src);


545  507 
srcM2 = vec_perm(srcR1, srcR2, permM2); 
546  508 
srcM1 = vec_perm(srcR1, srcR2, permM1); 
547  509 
srcP0 = vec_perm(srcR1, srcR2, permP0); 
...  ...  
550  512 
srcP3 = vec_perm(srcR2, srcR3, permP3); 
551  513 
} break; 
552  514 
case 14: { 
553 
vector unsigned char srcR3 = vec_ld(30, src);


515 
vec_u8_t srcR3 = vec_ld(30, src);


554  516 
srcM2 = vec_perm(srcR1, srcR2, permM2); 
555  517 
srcM1 = vec_perm(srcR1, srcR2, permM1); 
556  518 
srcP0 = srcR2; 
...  ...  
559  521 
srcP3 = vec_perm(srcR2, srcR3, permP3); 
560  522 
} break; 
561  523 
case 15: { 
562 
vector unsigned char srcR3 = vec_ld(30, src);


524 
vec_u8_t srcR3 = vec_ld(30, src);


563  525 
srcM2 = vec_perm(srcR1, srcR2, permM2); 
564  526 
srcM1 = srcR2; 
565  527 
srcP0 = vec_perm(srcR2, srcR3, permP0); 
...  ...  
569  531 
} break; 
570  532 
} 
571  533  
572 
srcP0A = (vector signed short) 

573 
vec_mergeh((vector unsigned char)vzero, srcP0); 

574 
srcP0B = (vector signed short) 

575 
vec_mergel((vector unsigned char)vzero, srcP0); 

576 
srcP1A = (vector signed short) 

577 
vec_mergeh((vector unsigned char)vzero, srcP1); 

578 
srcP1B = (vector signed short) 

579 
vec_mergel((vector unsigned char)vzero, srcP1); 

580  
581 
srcP2A = (vector signed short) 

582 
vec_mergeh((vector unsigned char)vzero, srcP2); 

583 
srcP2B = (vector signed short) 

584 
vec_mergel((vector unsigned char)vzero, srcP2); 

585 
srcP3A = (vector signed short) 

586 
vec_mergeh((vector unsigned char)vzero, srcP3); 

587 
srcP3B = (vector signed short) 

588 
vec_mergel((vector unsigned char)vzero, srcP3); 

589  
590 
srcM1A = (vector signed short) 

591 
vec_mergeh((vector unsigned char)vzero, srcM1); 

592 
srcM1B = (vector signed short) 

593 
vec_mergel((vector unsigned char)vzero, srcM1); 

594 
srcM2A = (vector signed short) 

595 
vec_mergeh((vector unsigned char)vzero, srcM2); 

596 
srcM2B = (vector signed short) 

597 
vec_mergel((vector unsigned char)vzero, srcM2); 

534 
srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); 

535 
srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); 

536 
srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); 

537 
srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); 

538  
539 
srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); 

540 
srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); 

541 
srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); 

542 
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); 

543  
544 
srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); 

545 
srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); 

546 
srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); 

547 
srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); 

598  548  
599  549 
sum1A = vec_adds(srcP0A, srcP1A); 
600  550 
sum1B = vec_adds(srcP0B, srcP1B); 
...  ...  
606  556 
pp1A = vec_mladd(sum1A, v20ss, sum3A); 
607  557 
pp1B = vec_mladd(sum1B, v20ss, sum3B); 
608  558  
609 
pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);


610 
pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);


559 
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);


560 
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);


611  561  
612  562 
psumA = vec_sub(pp1A, pp2A); 
613  563 
psumB = vec_sub(pp1B, pp2B); 
...  ...  
636  586 
tmpbis += tmpStride; 
637  587  
638  588 
for (i = 0 ; i < 16 ; i++) { 
639 
const vector signed short tmpP3ssA = vec_ld(0, tmpbis);


640 
const vector signed short tmpP3ssB = vec_ld(16, tmpbis);


589 
const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);


590 
const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);


641  591  
642 
const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);


643 
const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);


644 
const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);


645 
const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);


646 
const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);


647 
const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);


592 
const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);


593 
const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);


594 
const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);


595 
const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);


596 
const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);


597 
const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);


648  598  
649  599 
tmpbis += tmpStride; 
650  600  
...  ...  
669  619 
pp2Be = vec_mule(sum2B, v5ss); 
670  620 
pp2Bo = vec_mulo(sum2B, v5ss); 
671  621  
672 
pp3Ae = vec_sra((vector signed int)sum3A, v16ui);


622 
pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);


673  623 
pp3Ao = vec_mulo(sum3A, v1ss); 
674 
pp3Be = vec_sra((vector signed int)sum3B, v16ui);


624 
pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);


675  625 
pp3Bo = vec_mulo(sum3B, v1ss); 
676  626  
677  627 
pp1cAe = vec_add(pp1Ae, v512si); 
Also available in: Unified diff