/*
 * Copyright 2017 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * memcpy: An optimized memcpy implementation for x86_64. It uses AVX when
 *         __AVX__ is defined, and uses SSE2 otherwise.
 *
 * @author Bin Liu <binliu@fb.com>
 */

#if defined(__x86_64__) && defined(__linux__) && !defined(__CYGWIN__)

        .file   "memcpy.S"
        .text

/*
 * _memcpy_short is a local helper used when length < 8. It cannot be called
 * from outside, because it expects a non-standard calling convention:
 *
 *   %rax: destination buffer address.
 *   %rsi: source buffer address.
 *   %edx: length, in the range of [0, 7]
 */
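/*
 * Strategy, as implemented below: lengths in [4, 7] are handled with two
 * overlapping 4-byte moves (the first four and the last four bytes);
 * lengths in [1, 3] fall through to .LS4, which stores the first byte and,
 * when needed, the last two bytes.  %edi is clobbered along the way, which
 * is safe because the caller has already saved the destination in %rax.
 */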
        .type   _memcpy_short, @function
_memcpy_short:
.LSHORT:
        .cfi_startproc
        // if (length == 0) return;
        test    %edx, %edx
        jz      .LEND
        movzbl  (%rsi), %ecx
        // if (length - 4 < 0) goto LS4;
        sub     $4, %edx
        jb      .LS4
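        // length is in [4, 7]: copy the first four and the last four bytes.
        // The two stores overlap whenever length < 8, so together they
        // cover the whole range.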
        mov     (%rsi), %ecx
        mov     (%rsi, %rdx), %edi
        mov     %ecx, (%rax)
        mov     %edi, (%rax, %rdx)
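        // "rep ret" is the customary two-byte return used when a ret is
        // itself a branch target; it behaves exactly like a plain ret and
        // sidesteps a branch-prediction penalty on some older AMD cores.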
.LEND:
        rep
        ret
        nop
.LS4:
        // At this point, length can be 1 or 2 or 3, and %cl contains
        // the first byte.
        mov     %cl, (%rax)
        // if (length - 4 + 2 < 0) return;
        add     $2, %edx
        jnc     .LEND
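        // %edx still holds length - 4 from the sub above, so adding 2 sets
        // the carry flag only when length >= 2; a single-byte copy returns
        // via the jnc.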
        // length is 2 or 3 here. In either case, just copy the last
        // two bytes.
        movzwl  (%rsi, %rdx), %ecx
        mov     %cx, (%rax, %rdx)
        ret
        .cfi_endproc
        .size   _memcpy_short, .-_memcpy_short

/*
 * void* memcpy(void* dst, void* src, uint32_t length);
 */
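/*
 * Overall strategy for length >= 8 (shorter copies go to _memcpy_short):
 *   1. The last 8 bytes are copied up front, so any tail left over by the
 *      block copies below is already covered.
 *   2. A "head" of length & 24 bytes (0, 8, 16, or 24) is copied with one
 *      8-byte move plus, for heads of 16 or 24 bytes, one overlapping
 *      16-byte move.
 *   3. The remaining length & ~31 bytes are copied in 32- and 64-byte
 *      blocks, using ymm registers when __AVX__ is defined and xmm
 *      registers otherwise.
 */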
        .align  16
        .globl  memcpy
        .type   memcpy, @function
memcpy:
        .cfi_startproc
        mov     %rdx, %rcx
        mov     %rdi, %rax
        cmp     $8, %rdx
        jb      .LSHORT
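        // length >= 8 from here on.  Load the first and last 8 bytes of the
        // source now; the last 8 bytes are stored immediately, so the tail
        // (length % 8) is covered no matter how the block copies below
        // divide up the buffer.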
        mov     -8(%rsi, %rdx), %r8
        mov     (%rsi), %r9
        mov     %r8, -8(%rdi, %rdx)
        and     $24, %rcx
        jz      .L32
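        // %rcx = length & 24 is the size of the unaligned head (8, 16, or
        // 24 bytes).  Store the first 8 bytes, and for a 16- or 24-byte
        // head add one 16-byte copy ending exactly at the head boundary.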
        mov     %r9, (%rdi)
        mov     %rcx, %r8
        sub     $16, %rcx
        jb      .LT32
#ifndef __AVX__
        movdqu  (%rsi, %rcx), %xmm1
        movdqu  %xmm1, (%rdi, %rcx)
#else
        vmovdqu (%rsi, %rcx), %xmm1
        vmovdqu %xmm1, (%rdi, %rcx)
#endif
        // Test if there are 32-byte groups
.LT32:
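        // Advance %rsi past the head and round the length down to a
        // multiple of 32.  If nothing remains, the head plus the earlier
        // last-8-byte store already covered the whole buffer.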
        add     %r8, %rsi
        and     $-32, %rdx
        jnz     .L32_adjDI
        ret

        .align  16
.L32_adjDI:
        add     %r8, %rdi
.L32:
#ifndef __AVX__
        movdqu  (%rsi), %xmm0
        movdqu  16(%rsi), %xmm1
#else
        vmovdqu (%rsi), %ymm0
#endif
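        // Divide the remaining length by 64: %rdx becomes the number of
        // 64-byte blocks and the carry flag records a leftover 32-byte
        // block.  The 32 bytes loaded above are either stored right away
        // (odd block) or carried into the loop at .L64_32read.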
        shr     $6, %rdx
        jnc     .L64_32read
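        // Odd 32-byte block: store it now, advance the source pointer, and
        // continue at .L64_adjDI if any full 64-byte blocks remain.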
#ifndef __AVX__
        movdqu  %xmm0, (%rdi)
        movdqu  %xmm1, 16(%rdi)
#else
        vmovdqu %ymm0, (%rdi)
#endif
        lea     32(%rsi), %rsi
        jnz     .L64_adjDI
#ifdef __AVX__
        vzeroupper
#endif
        ret
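        // Main loop: copy 64 bytes per iteration, using two 32-byte ymm
        // moves under AVX or four 16-byte xmm moves under SSE2.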
.L64_adjDI:
        add     $32, %rdi
.L64:
#ifndef __AVX__
        movdqu  (%rsi), %xmm0
        movdqu  16(%rsi), %xmm1
#else
        vmovdqu (%rsi), %ymm0
#endif
.L64_32read:
#ifndef __AVX__
        movdqu  32(%rsi), %xmm2
        movdqu  48(%rsi), %xmm3
        add     $64, %rsi
        movdqu  %xmm0, (%rdi)
        movdqu  %xmm1, 16(%rdi)
        movdqu  %xmm2, 32(%rdi)
        movdqu  %xmm3, 48(%rdi)
#else
        vmovdqu 32(%rsi), %ymm1
        add     $64, %rsi
        vmovdqu %ymm0, (%rdi)
        vmovdqu %ymm1, 32(%rdi)
#endif
        add     $64, %rdi
        dec     %rdx
        jnz     .L64
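        // Under AVX, clear the upper ymm halves before returning to avoid
        // SSE/AVX transition penalties in the caller.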
#ifdef __AVX__
        vzeroupper
#endif
        ret
        .cfi_endproc
        .size   memcpy, .-memcpy

#endif