;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text
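
; convolve_fn generates the filter-less "copy" and "avg" convolution kernels.
; %1 selects copy vs. avg (avg rounds src together with dst via pavgb, or
; pavgw for 16-bit samples); the optional %2 = highbd builds the
; high-bitdepth variant. The fx/fxs/fy/fys filter arguments are declared to
; match the common convolve prototype but are unused here.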
%macro convolve_fn 1-2
%ifidn %1, avg
%define AUX_XMM_REGS 4
%else
%define AUX_XMM_REGS 0
%endif
%ifidn %2, highbd
%define pavg pavgw
cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
                                              dst, dst_stride, \
                                              fx, fxs, fy, fys, w, h, bd
%else
%define pavg pavgb
cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
                                           dst, dst_stride, \
                                           fx, fxs, fy, fys, w, h
%endif
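
; Dispatch on the row width. In the high-bitdepth build each sample is 16
; bits, so the width and strides (given in samples) are doubled to byte
; units, and the pointers are doubled to match libvpx's shifted
; high-bitdepth pointer convention. The block labels below are named after
; the row width in bytes.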
  mov r4d, dword wm
%ifidn %2, highbd
  shl r4d, 1
  shl srcq, 1
  shl src_strideq, 1
  shl dstq, 1
  shl dst_strideq, 1
%else
  cmp r4d, 4
  je .w4
%endif
  cmp r4d, 8
  je .w8
  cmp r4d, 16
  je .w16
  cmp r4d, 32
  je .w32
%ifidn %2, highbd
  cmp r4d, 64
  je .w64
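
; 128 bytes per row (high bitdepth only, w == 64 samples): copy each row in
; eight 16-byte XMM chunks, averaging with dst first in the avg variant.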
  mov r4d, dword hm
.loop128:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq   ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  movu m0, [srcq+64]
  movu m1, [srcq+80]
  movu m2, [srcq+96]
  movu m3, [srcq+112]
  add srcq, src_strideq
%ifidn %1, avg
  pavg m0, [dstq+64]
  pavg m1, [dstq+80]
  pavg m2, [dstq+96]
  pavg m3, [dstq+112]
%endif
  mova [dstq+64], m0
  mova [dstq+80], m1
  mova [dstq+96], m2
  mova [dstq+112], m3
  add dstq, dst_strideq
  dec r4d
  jnz .loop128
  RET
%endif
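
; 64 bytes per row (w == 64 in 8-bit, w == 32 in high bitdepth): four
; 16-byte chunks per row, one row per iteration.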
.w64:
  mov r4d, dword hm
.loop64:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
  add srcq, src_strideq
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq   ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  add dstq, dst_strideq
  dec r4d
  jnz .loop64
  RET
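
; 32 bytes per row: two 16-byte chunks per row, two rows per iteration.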
.w32:
  mov r4d, dword hm
.loop32:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+src_strideq]
  movu m3, [srcq+src_strideq+16]
  lea srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq            +16]
  pavg m2, [dstq+dst_strideq]
  pavg m3, [dstq+dst_strideq+16]
%endif
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq+dst_strideq   ], m2
  mova [dstq+dst_strideq+16], m3
  lea dstq, [dstq+dst_strideq*2]
  sub r4d, 2
  jnz .loop32
  RET
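
; 16 bytes per row: one XMM register per row, four rows per iteration.
; r5/r6 hold 3*stride so all four rows are addressable without extra adds.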
.w16:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop16:
  movu m0, [srcq]
  movu m1, [srcq+src_strideq]
  movu m2, [srcq+src_strideq*2]
  movu m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+dst_strideq]
  pavg m2, [dstq+dst_strideq*2]
  pavg m3, [dstq+r6q]
%endif
  mova [dstq               ], m0
  mova [dstq+dst_strideq   ], m1
  mova [dstq+dst_strideq*2 ], m2
  mova [dstq+r6q           ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop16
  RET
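
; 8 bytes per row: movh transfers the low qword of an XMM register. A
; 16-byte pavg memory operand would overread the 8-byte rows, so the avg
; variant loads dst into m4-m7 first (the AUX_XMM_REGS reserved above).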
.w8:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop8:
  movh m0, [srcq]
  movh m1, [srcq+src_strideq]
  movh m2, [srcq+src_strideq*2]
  movh m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movh m4, [dstq]
  movh m5, [dstq+dst_strideq]
  movh m6, [dstq+dst_strideq*2]
  movh m7, [dstq+r6q]
  pavg m0, m4
  pavg m1, m5
  pavg m2, m6
  pavg m3, m7
%endif
  movh [dstq               ], m0
  movh [dstq+dst_strideq   ], m1
  movh [dstq+dst_strideq*2 ], m2
  movh [dstq+r6q           ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop8
  RET
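
; 4 bytes per row, 8-bit builds only (the smallest high-bitdepth row is
; 4 samples == 8 bytes and lands in .w8): movd moves a dword, four rows
; per iteration.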
%ifnidn %2, highbd
.w4:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop4:
  movd m0, [srcq]
  movd m1, [srcq+src_strideq]
  movd m2, [srcq+src_strideq*2]
  movd m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movd m4, [dstq]
  movd m5, [dstq+dst_strideq]
  movd m6, [dstq+dst_strideq*2]
  movd m7, [dstq+r6q]
  pavg m0, m4
  pavg m1, m5
  pavg m2, m6
  pavg m3, m7
%endif
  movd [dstq               ], m0
  movd [dstq+dst_strideq   ], m1
  movd [dstq+dst_strideq*2 ], m2
  movd [dstq+r6q           ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop4
  RET
%endif
%endmacro
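
; Instantiate the SSE2 copy and avg kernels, plus the high-bitdepth
; variants when CONFIG_VP9_HIGHBITDEPTH is enabled.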
INIT_XMM sse2
convolve_fn copy
convolve_fn avg
%if CONFIG_VP9_HIGHBITDEPTH
convolve_fn copy, highbd
convolve_fn avg, highbd
%endif