## ffmpeg / libavcodec / x86 / h264_weight.asm @ fbb6b49d

History | View | Annotate | Download (8.33 KB)

1 |
;***************************************************************************** |
---|---|

2 |
;* SSE2-optimized weighted prediction code |

3 |
;***************************************************************************** |

4 |
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt |

5 |
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com> |

6 |
;* |

7 |
;* This file is part of FFmpeg. |

8 |
;* |

9 |
;* FFmpeg is free software; you can redistribute it and/or |

10 |
;* modify it under the terms of the GNU Lesser General Public |

11 |
;* License as published by the Free Software Foundation; either |

12 |
;* version 2.1 of the License, or (at your option) any later version. |

13 |
;* |

14 |
;* FFmpeg is distributed in the hope that it will be useful, |

15 |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

16 |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

17 |
;* Lesser General Public License for more details. |

18 |
;* |

19 |
;* You should have received a copy of the GNU Lesser General Public |

20 |
;* License along with FFmpeg; if not, write to the Free Software |

21 |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

22 |
;****************************************************************************** |

23 | |

24 |
%include "x86inc.asm" |

25 | |

26 |
SECTION .text |

27 | |

28 |
;----------------------------------------------------------------------------- |

29 |
; biweight pred: |

30 |
; |

31 |
; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, |

32 |
; int log2_denom, int weightd, int weights, |

33 |
; int offset); |

34 |
; and |

35 |
; void h264_weight_16x16_sse2(uint8_t *dst, int stride, |

36 |
; int log2_denom, int weight, |

37 |
; int offset); |

38 |
;----------------------------------------------------------------------------- |

39 | |

40 |
;-----------------------------------------------------------------------------
; WEIGHT_SETUP
; Builds the constants shared by every unidirectional weight loop.
;
; In  (register roles follow the h264_weight prototype in the file header,
;      assuming the usual x86inc "rN = argument N" mapping):
;        r2 = log2_denom, r3 = weight, r4 = offset
; Out:   m3 = low 16 bits of weight, replicated into every word lane
;        m5 = low 16 bits of (((2*offset + 1) << log2_denom) >> 1),
;             replicated into every word lane (offset plus rounding term)
;        m6 = log2_denom, used later as a shift count
;        m7 = 0, used to zero-extend bytes to words
; Clobbers: r4, flags
;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0
    pxor       m7, m7               ; m7 = 0
    movd       m6, r2d              ; m6 = shift count
    movd       m3, r3d              ; m3 = weight (lane 0 only for now)

    add        r4, r4
    inc        r4                   ; r4 = 2*offset + 1
    movd       m5, r4d
    pslld      m5, m6
    psrld      m5, 1                ; m5 = ((2*offset + 1) << log2_denom) >> 1

    ; Broadcast word 0 of m3 and m5 across the whole register.
%if mmsize == 16
    pshuflw    m3, m3, 0
    punpcklqdq m3, m3
    pshuflw    m5, m5, 0
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m5, m5, 0
%endif
%endmacro

59 | |

60 |
;-----------------------------------------------------------------------------
; WEIGHT_OP off_a, off_b
; Weights two half-register groups of pixels read from [r0+off_a] and
; [r0+off_b] and leaves the packed bytes in m0 (group A in the low half,
; group B in the high half).
;
; Per pixel: clip_uint8((pixel * m3 + m5) >> m6), using a saturating word add,
; an arithmetic shift and an unsigned saturating pack.
;
; Expects m3/m5/m6/m7 as prepared by WEIGHT_SETUP.
; Clobbers: m0, m1
;-----------------------------------------------------------------------------
%macro WEIGHT_OP 2
    ; group A -> m0
    movh       m0, [r0+%1]
    punpcklbw  m0, m7               ; bytes -> words
    pmullw     m0, m3               ; * weight
    paddsw     m0, m5               ; + offset/rounding term
    psraw      m0, m6               ; >> log2_denom

    ; group B -> m1
    movh       m1, [r0+%2]
    punpcklbw  m1, m7
    pmullw     m1, m3
    paddsw     m1, m5
    psraw      m1, m6

    packuswb   m0, m1               ; words -> bytes with unsigned saturation
%endmacro

73 | |

74 |
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_DBL_MM height
; Emits h264_weight_16x<height>_mmx2: a 16-pixel-wide weight function that
; handles each row as two 8-byte stores.
;
; Only the height == 16 instance contains the row loop. Every other height
; runs its own prologue and setup, loads its row count into r2 and jumps into
; the 16-row instance's .nextrow loop, sharing that loop and its return.
;
; In the loop: r0 = dst (advanced by r1 per row), r1 = stride, r2 = rows left.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_DBL_MM 1
cglobal h264_weight_16x%1_mmx2, 5, 5, 0
    WEIGHT_SETUP
    mov        r2, %1               ; row counter (log2_denom already lives in m6)
%if %1 != 16
    jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
%else
.nextrow:
    WEIGHT_OP  0, 4                 ; pixels 0..7
    mova       [r0  ], m0
    WEIGHT_OP  8, 12                ; pixels 8..15
    mova       [r0+8], m0
    add        r0, r1
    dec        r2
    jnz        .nextrow
    REP_RET
%endif
%endmacro

INIT_MMX
WEIGHT_FUNC_DBL_MM 16
WEIGHT_FUNC_DBL_MM  8

96 | |

97 |
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_MM width, height, xmm_regs, cpu_suffix
; Emits h264_weight_<width>x<height>_<cpu_suffix>, where one full SIMD
; register (mmsize bytes) covers a whole row.
;
; The prototype in the file header has five arguments (dst, stride,
; log2_denom, weight, offset) and the body only touches r0-r4, so cglobal
; declares 5 arguments / 5 GPRs, the same as WEIGHT_FUNC_DBL_MM and
; WEIGHT_FUNC_HALF_MM. Declaring 7 made the prologue fetch two argument slots
; that do not exist and reserve two registers that are never used.
;
; Only the height == 16 instance contains the row loop. Other heights load
; their row count into r2 and jump into that instance's .nextrow loop. All
; instances of one width come from this macro, so their prologues match the
; shared return sequence.
;
; In the loop: r0 = dst (advanced by r1 per row), r1 = stride, r2 = rows left.
; NOTE(review): stride is declared int in the prototype but r1 is added at
; full register width; confirm callers guarantee sign-extension on x86-64.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_MM 4
cglobal h264_weight_%1x%2_%4, 5, 5, %3
    WEIGHT_SETUP
    mov        r2, %2               ; row counter (log2_denom already lives in m6)
%if %2 == 16
.nextrow:
    WEIGHT_OP  0, mmsize/2          ; low and high half of the row
    mova       [r0], m0
    add        r0, r1
    dec        r2
    jnz        .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
%endif
%endmacro

INIT_MMX
WEIGHT_FUNC_MM  8, 16,  0, mmx2
WEIGHT_FUNC_MM  8,  8,  0, mmx2
WEIGHT_FUNC_MM  8,  4,  0, mmx2
INIT_XMM
WEIGHT_FUNC_MM 16, 16,  8, sse2
WEIGHT_FUNC_MM 16,  8,  8, sse2

121 | |

122 |
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_HALF_MM width, height, loop_height, xmm_regs, cpu_suffix
; Emits h264_weight_<width>x<height>_<cpu_suffix>, where a row is only half a
; SIMD register wide, so each iteration handles two rows at once.
;
; The instance whose height equals mmsize contains the loop. All other
; heights load their iteration count and jump into the .nextrow loop of the
; <width>x<loop_height> instance.
;
; In the loop: r0 = dst, r1 = stride, r2 = row pairs left, r3 = 2*stride.
; r3 (weight) is reused here because WEIGHT_SETUP has already copied it to m3.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_HALF_MM 5
cglobal h264_weight_%1x%2_%5, 5, 5, %4
    WEIGHT_SETUP
    lea        r3, [r1*2]           ; advance by two rows per iteration
    mov        r2, %2/2             ; number of row pairs
%if %2 != mmsize
    jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
%else
.nextrow:
    WEIGHT_OP  0, r1                ; row n -> low half of m0, row n+1 -> high half
    movh       [r0], m0
%if mmsize == 16
    movhps     [r0+r1], m0
%else
    psrlq      m0, 32               ; bring the second row down to the low dword
    movh       [r0+r1], m0
%endif
    add        r0, r3
    dec        r2
    jnz        .nextrow
    REP_RET
%endif
%endmacro

INIT_MMX
WEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
WEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
WEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
INIT_XMM
WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
WEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2

154 | |

155 |
;-----------------------------------------------------------------------------
; BIWEIGHT_SETUP
; Builds the constants shared by the MMX2/SSE2 bidirectional weight loops.
;
; In  (register roles follow the h264_biweight prototype in the file header,
;      assuming the usual x86inc "rN = argument N" mapping):
;        r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset
; Out:   m3 = low 16 bits of weightd, replicated into every word lane
;        m4 = low 16 bits of weights, replicated into every word lane
;        m5 = low 16 bits of ((((offset + 1) | 1) << (log2_denom + 1)) >> 1),
;             replicated into every word lane
;        m6 = log2_denom + 1, used later as a shift count
;        m7 = 0, used to zero-extend bytes to words
; Clobbers: r3, r6, flags
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SETUP 0
    pxor       m7, m7               ; m7 = 0

    movd       m3, r4d              ; weight applied to [r0] pixels
    movd       m4, r5d              ; weight applied to [r1] pixels

    add        r3, 1
    movd       m6, r3d              ; m6 = log2_denom + 1

    add        r6, 1
    or         r6, 1                ; r6 = (offset + 1) | 1
    movd       m5, r6d
    pslld      m5, m6
    psrld      m5, 1                ; m5 = (r6 << (log2_denom + 1)) >> 1

    ; Broadcast word 0 of each constant across the whole register.
%if mmsize == 16
    pshuflw    m3, m3, 0
    punpcklqdq m3, m3
    pshuflw    m4, m4, 0
    punpcklqdq m4, m4
    pshuflw    m5, m5, 0
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%endif
%endmacro

179 | |

180 |
;-----------------------------------------------------------------------------
; BIWEIGHT_STEPA acc, tmp, offset
; First half of the bidirectional weight for one half-register group:
;   m<acc> = sat16([r0+offset] * m3 + [r1+offset] * m4)
; Pixels are zero-extended to words via m7 before multiplying.
;
; acc and tmp are register indices and must name two different registers.
; Clobbers: m<acc>, m<tmp>
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPA 3
    ; [r0] contribution
    movh       m%1, [r0+%3]
    punpcklbw  m%1, m7
    pmullw     m%1, m3

    ; [r1] contribution
    movh       m%2, [r1+%3]
    punpcklbw  m%2, m7
    pmullw     m%2, m4

    paddsw     m%1, m%2             ; saturating sum of both contributions
%endmacro

189 | |

190 |
;-----------------------------------------------------------------------------
; BIWEIGHT_STEPB
; Second half of the bidirectional weight: adds the offset/rounding term (m5)
; to the word sums in m0 and m1, shifts both right arithmetically by m6 and
; packs them to bytes in m0 with unsigned saturation (m0 words in the low
; half, m1 words in the high half).
; Clobbers: m0, m1
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPB 0
    paddsw     m0, m5
    psraw      m0, m6

    paddsw     m1, m5
    psraw      m1, m6

    packuswb   m0, m1
%endmacro

197 | |

198 |
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_DBL_MM height
; Emits h264_biweight_16x<height>_mmx2: a 16-pixel-wide bidirectional weight
; function that handles each row as two 8-byte stores.
;
; Only the height == 16 instance contains the row loop. Other heights load
; their row count into r3 and jump into that instance's .nextrow loop.
;
; In the loop: r0 = dst (read and written), r1 = src, r2 = stride (applied to
; both pointers), r3 = rows left.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_DBL_MM 1
cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
    BIWEIGHT_SETUP
    mov        r3, %1               ; row counter (shift count already lives in m6)
%if %1 != 16
    jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
%else
.nextrow:
    ; pixels 0..7
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, 4
    BIWEIGHT_STEPB
    mova       [r0], m0
    ; pixels 8..15
    BIWEIGHT_STEPA 0, 1, 8
    BIWEIGHT_STEPA 1, 2, 12
    BIWEIGHT_STEPB
    mova       [r0+8], m0
    add        r0, r2
    add        r1, r2
    dec        r3
    jnz        .nextrow
    REP_RET
%endif
%endmacro

INIT_MMX
BIWEIGHT_FUNC_DBL_MM 16
BIWEIGHT_FUNC_DBL_MM  8

225 | |

226 |
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_MM width, height, xmm_regs, cpu_suffix
; Emits h264_biweight_<width>x<height>_<cpu_suffix>, where one full SIMD
; register (mmsize bytes) covers a whole row.
;
; Only the height == 16 instance contains the row loop. Other heights load
; their row count into r3 and jump into the .nextrow loop of the
; <width>x16 instance for the same cpu_suffix.
;
; In the loop: r0 = dst (read and written), r1 = src, r2 = stride (applied to
; both pointers), r3 = rows left.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_MM 4
cglobal h264_biweight_%1x%2_%4, 7, 7, %3
    BIWEIGHT_SETUP
    mov        r3, %2               ; row counter (shift count already lives in m6)
%if %2 != 16
    jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
%else
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0          ; low half of the row  -> m0
    BIWEIGHT_STEPA 1, 2, mmsize/2   ; high half of the row -> m1
    BIWEIGHT_STEPB
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3
    jnz        .nextrow
    REP_RET
%endif
%endmacro

INIT_MMX
BIWEIGHT_FUNC_MM  8, 16,  0, mmx2
BIWEIGHT_FUNC_MM  8,  8,  0, mmx2
BIWEIGHT_FUNC_MM  8,  4,  0, mmx2
INIT_XMM
BIWEIGHT_FUNC_MM 16, 16,  8, sse2
BIWEIGHT_FUNC_MM 16,  8,  8, sse2

253 | |

254 |
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_HALF_MM width, height, loop_height, xmm_regs, cpu_suffix
; Emits h264_biweight_<width>x<height>_<cpu_suffix>, where a row is only half
; a SIMD register wide, so each iteration handles two rows at once.
;
; The instance whose height equals mmsize contains the loop. All other
; heights load their iteration count and jump into the .nextrow loop of the
; <width>x<loop_height> instance.
;
; In the loop: r0 = dst (read and written), r1 = src, r2 = stride,
; r3 = row pairs left, r4 = 2*stride.
; r4 (weightd) is reused here because BIWEIGHT_SETUP has already copied it
; to m3.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_HALF_MM 5
cglobal h264_biweight_%1x%2_%5, 7, 7, %4
    BIWEIGHT_SETUP
    lea        r4, [r2*2]           ; advance by two rows per iteration
    mov        r3, %2/2             ; number of row pairs
%if %2 != mmsize
    jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
%else
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0          ; row n   -> m0
    BIWEIGHT_STEPA 1, 2, r2         ; row n+1 -> m1
    BIWEIGHT_STEPB
    movh       [r0], m0
%if mmsize == 16
    movhps     [r0+r2], m0
%else
    psrlq      m0, 32               ; bring the second row down to the low dword
    movh       [r0+r2], m0
%endif
    add        r0, r4
    add        r1, r4
    dec        r3
    jnz        .nextrow
    REP_RET
%endif
%endmacro

INIT_MMX
BIWEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
BIWEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
BIWEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
INIT_XMM
BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
BIWEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2

289 | |

290 |
;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_SETUP
; Builds the constants for the SSSE3 bidirectional weight loops, which use
; pmaddubsw on interleaved pixel bytes.
;
; In  (register roles follow the h264_biweight prototype in the file header,
;      assuming the usual x86inc "rN = argument N" mapping):
;        r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset
; Out:   m4 = the byte pair (low byte of weightd, low byte of weights)
;             repeated across the register, matching the interleave order
;             the loops build with punpcklbw
;        m5 = low 16 bits of ((((offset + 1) | 1) << (log2_denom + 1)) >> 1),
;             replicated into every word lane
;        m6 = log2_denom + 1, used later as a shift count
; Clobbers: r3, r6, m0, flags
; Only the low byte of each weight is kept.
; NOTE(review): presumably callers only select this path when both weights
; fit pmaddubsw's signed-byte operand; confirm.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_SETUP 0
    ; Interleaved weight pair.
    movd       m4, r4d
    movd       m0, r5d
    punpcklbw  m4, m0               ; word 0 = weights:weightd
    pshuflw    m4, m4, 0
    punpcklqdq m4, m4

    ; Shift count.
    add        r3, 1
    movd       m6, r3d              ; m6 = log2_denom + 1

    ; Offset/rounding term.
    add        r6, 1
    or         r6, 1                ; r6 = (offset + 1) | 1
    movd       m5, r6d
    pslld      m5, m6
    psrld      m5, 1
    pshuflw    m5, m5, 0
    punpcklqdq m5, m5
%endmacro

306 | |

307 |
;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_OP
; Weights the interleaved pixel bytes held in m0 and m2.
; For each register: pmaddubsw against the weight pair in m4 gives one word
; sum per byte pair, then the offset/rounding term m5 is added with
; saturation and the result is shifted right arithmetically by m6.
; Finally both registers are packed to bytes in m0 with unsigned saturation
; (m0 results in the low half, m2 results in the high half).
; Clobbers: m0, m2
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m0, m4
    paddsw     m0, m5
    psraw      m0, m6

    pmaddubsw  m2, m4
    paddsw     m2, m5
    psraw      m2, m6

    packuswb   m0, m2
%endmacro

316 | |

317 |
;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_16 height
; Emits h264_biweight_16x<height>_ssse3, one 16-pixel row per iteration.
;
; Only the height == 16 instance contains the row loop. Other heights load
; their row count into r3 and jump into that instance's .nextrow loop.
;
; In the loop: r0 = dst (read and written), r1 = src, r2 = stride (applied to
; both pointers), r3 = rows left.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_16 1
cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
    mov        r3, %1               ; row counter (shift count already lives in m6)

%if %1 != 16
    jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
%else
.nextrow:
    ; pixels 0..7: interleave [r0] bytes with [r1] bytes
    movh       m0, [r0]
    punpcklbw  m0, [r1]             ; full-width memory operand: the legacy-SSE
                                    ; form requires [r1] to be 16-byte aligned
    ; pixels 8..15
    movh       m2, [r0+8]
    movh       m3, [r1+8]
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3
    jnz        .nextrow
    REP_RET
%endif
%endmacro

INIT_XMM
BIWEIGHT_SSSE3_16 16
BIWEIGHT_SSSE3_16  8

344 | |

345 |
;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_8 height
; Emits h264_biweight_8x<height>_ssse3, two 8-pixel rows per iteration.
;
; Only the height == 16 instance contains the loop. Other heights load their
; iteration count into r3 and jump into that instance's .nextrow loop.
;
; In the loop: r0 = dst (read and written), r1 = src, r2 = stride,
; r3 = row pairs left, r4 = 2*stride.
; r4 (weightd) is reused here because BIWEIGHT_SSSE3_SETUP has already copied
; it to m4.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_8 1
cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
    lea        r4, [r2*2]           ; advance by two rows per iteration
    mov        r3, %1/2             ; number of row pairs

%if %1 != 16
    jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
%else
.nextrow:
    ; row n: interleave [r0] bytes with [r1] bytes
    movh       m0, [r0]
    movh       m1, [r1]
    punpcklbw  m0, m1
    ; row n+1
    movh       m2, [r0+r2]
    movh       m3, [r1+r2]
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    movh       [r0], m0             ; row n
    movhps     [r0+r2], m0          ; row n+1
    add        r0, r4
    add        r1, r4
    dec        r3
    jnz        .nextrow
    REP_RET
%endif
%endmacro

INIT_XMM
BIWEIGHT_SSSE3_8 16
BIWEIGHT_SSSE3_8  8
BIWEIGHT_SSSE3_8  4